Mercurial > mplayer.hg
annotate liba52/imdct.c @ 14802:63b1cc62fc9b
MEncoder multiple files patch by Oded Shimon (ods15)
Seems to work, or at least not to cause problems with existing
functionality (encoding single files).
Please test and report bugs, if there are any!
author | rfelker |
---|---|
date | Fri, 25 Feb 2005 02:32:29 +0000 |
parents | f881c918739b |
children | 07f1e7669772 |
rev | line source |
---|---|
3394 | 1 /* |
2 * imdct.c | |
3 * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org> | |
4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> | |
5 * | |
6 * This file is part of a52dec, a free ATSC A-52 stream decoder. | |
7 * See http://liba52.sourceforge.net/ for updates. | |
8 * | |
9 * a52dec is free software; you can redistribute it and/or modify | |
10 * it under the terms of the GNU General Public License as published by | |
11 * the Free Software Foundation; either version 2 of the License, or | |
12 * (at your option) any later version. | |
13 * | |
14 * a52dec is distributed in the hope that it will be useful, | |
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 * GNU General Public License for more details. | |
18 * | |
19 * You should have received a copy of the GNU General Public License | |
20 * along with this program; if not, write to the Free Software | |
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
3579 | 22 * |
23 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at) | |
3884 | 24 * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru> |
25 * michael did port them from libac3 (untested, perhaps totally broken) | |
9001
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
26 * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org) |
3394 | 27 */ |
28 | |
29 #include "config.h" | |
30 | |
31 #include <math.h> | |
32 #include <stdio.h> | |
33 #ifndef M_PI | |
34 #define M_PI 3.1415926535897932384626433832795029 | |
35 #endif | |
36 #include <inttypes.h> | |
37 | |
38 #include "a52.h" | |
39 #include "a52_internal.h" | |
40 #include "mm_accel.h" | |
4247
2dbd637ffe05
mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents:
3908
diff
changeset
|
41 #include "mangle.h" |
3394 | 42 |
3884 | 43 #ifdef RUNTIME_CPUDETECT |
44 #undef HAVE_3DNOWEX | |
45 #endif | |
46 | |
47 #define USE_AC3_C | |
48 | |
3394 | 49 void (* imdct_256) (sample_t data[], sample_t delay[], sample_t bias); |
50 void (* imdct_512) (sample_t data[], sample_t delay[], sample_t bias); | |
51 | |
52 typedef struct complex_s { | |
53 sample_t real; | |
54 sample_t imag; | |
55 } complex_t; | |
56 | |
3884 | 57 static void fft_128p(complex_t *a); |
58 | |
12303
f881c918739b
attribute_used patch by (VMiklos <mamajom at axelero dot hu>)
michael
parents:
9122
diff
changeset
|
59 static const int pm128[128] attribute_used __attribute__((aligned(16))) = |
3884 | 60 { |
61 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120, | |
62 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124, | |
63 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122, | |
64 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126, | |
65 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121, | |
66 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125, | |
67 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123, | |
68 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127 | |
69 }; | |
3394 | 70 |
71 /* 128 point bit-reverse LUT */ | |
12303
f881c918739b
attribute_used patch by (VMiklos <mamajom at axelero dot hu>)
michael
parents:
9122
diff
changeset
|
72 static uint8_t attribute_used bit_reverse_512[] = { |
3394 | 73 0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70, |
74 0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78, | |
75 0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74, | |
76 0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c, | |
77 0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72, | |
78 0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a, | |
79 0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76, | |
80 0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e, | |
81 0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71, | |
82 0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79, | |
83 0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75, | |
84 0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d, | |
85 0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73, | |
86 0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b, | |
87 0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77, | |
88 0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f}; | |
89 | |
90 static uint8_t bit_reverse_256[] = { | |
91 0x00, 0x20, 0x10, 0x30, 0x08, 0x28, 0x18, 0x38, | |
92 0x04, 0x24, 0x14, 0x34, 0x0c, 0x2c, 0x1c, 0x3c, | |
93 0x02, 0x22, 0x12, 0x32, 0x0a, 0x2a, 0x1a, 0x3a, | |
94 0x06, 0x26, 0x16, 0x36, 0x0e, 0x2e, 0x1e, 0x3e, | |
95 0x01, 0x21, 0x11, 0x31, 0x09, 0x29, 0x19, 0x39, | |
96 0x05, 0x25, 0x15, 0x35, 0x0d, 0x2d, 0x1d, 0x3d, | |
97 0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b, | |
98 0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f}; | |
99 | |
3579 | 100 #ifdef ARCH_X86 |
3508 | 101 // NOTE: SSE needs 16byte alignment or it will segfault |
3581 | 102 // |
3508 | 103 static complex_t __attribute__((aligned(16))) buf[128]; |
3581 | 104 static float __attribute__((aligned(16))) sseSinCos1c[256]; |
105 static float __attribute__((aligned(16))) sseSinCos1d[256]; | |
12303
f881c918739b
attribute_used patch by (VMiklos <mamajom at axelero dot hu>)
michael
parents:
9122
diff
changeset
|
106 static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1}; |
3534 | 107 //static float __attribute__((aligned(16))) sseW0[4]; |
108 static float __attribute__((aligned(16))) sseW1[8]; | |
109 static float __attribute__((aligned(16))) sseW2[16]; | |
110 static float __attribute__((aligned(16))) sseW3[32]; | |
111 static float __attribute__((aligned(16))) sseW4[64]; | |
112 static float __attribute__((aligned(16))) sseW5[128]; | |
113 static float __attribute__((aligned(16))) sseW6[256]; | |
114 static float __attribute__((aligned(16))) *sseW[7]= | |
115 {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6}; | |
3553 | 116 static float __attribute__((aligned(16))) sseWindow[512]; |
3508 | 117 #else |
9001
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
118 static complex_t __attribute__((aligned(16))) buf[128]; |
3508 | 119 #endif |
3394 | 120 |
121 /* Twiddle factor LUT */ | |
9001
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
122 static complex_t __attribute__((aligned(16))) w_1[1]; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
123 static complex_t __attribute__((aligned(16))) w_2[2]; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
124 static complex_t __attribute__((aligned(16))) w_4[4]; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
125 static complex_t __attribute__((aligned(16))) w_8[8]; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
126 static complex_t __attribute__((aligned(16))) w_16[16]; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
127 static complex_t __attribute__((aligned(16))) w_32[32]; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
128 static complex_t __attribute__((aligned(16))) w_64[64]; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
129 static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64}; |
3394 | 130 |
131 /* Twiddle factors for IMDCT */ | |
9001
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
132 static sample_t __attribute__((aligned(16))) xcos1[128]; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
133 static sample_t __attribute__((aligned(16))) xsin1[128]; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
134 static sample_t __attribute__((aligned(16))) xcos2[64]; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
135 static sample_t __attribute__((aligned(16))) xsin2[64]; |
3394 | 136 |
137 /* Windowing function for Modified DCT - Thank you acroread */ | |
138 sample_t imdct_window[] = { | |
139 0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130, | |
140 0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443, | |
141 0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061, | |
142 0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121, | |
143 0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770, | |
144 0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153, | |
145 0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389, | |
146 0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563, | |
147 0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699, | |
148 0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757, | |
149 0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626, | |
150 0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126, | |
151 0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019, | |
152 0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031, | |
153 0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873, | |
154 0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269, | |
155 0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981, | |
156 0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831, | |
157 0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716, | |
158 0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610, | |
159 0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560, | |
160 0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674, | |
161 0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099, | |
162 0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994, | |
163 0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513, | |
164 0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788, | |
165 0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919, | |
166 0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974, | |
167 0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993, | |
168 0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999, | |
169 0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, | |
170 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000 }; | |
171 | |
172 | |
173 static inline void swap_cmplx(complex_t *a, complex_t *b) | |
174 { | |
175 complex_t tmp; | |
176 | |
177 tmp = *a; | |
178 *a = *b; | |
179 *b = tmp; | |
180 } | |
181 | |
182 | |
183 | |
184 static inline complex_t cmplx_mult(complex_t a, complex_t b) | |
185 { | |
186 complex_t ret; | |
187 | |
188 ret.real = a.real * b.real - a.imag * b.imag; | |
189 ret.imag = a.real * b.imag + a.imag * b.real; | |
190 | |
191 return ret; | |
192 } | |
193 | |
194 void | |
195 imdct_do_512(sample_t data[],sample_t delay[], sample_t bias) | |
196 { | |
8254
772d6d27fd66
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
4497
diff
changeset
|
197 int i; |
772d6d27fd66
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
4497
diff
changeset
|
198 #ifndef USE_AC3_C |
772d6d27fd66
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
4497
diff
changeset
|
199 int k; |
3394 | 200 int p,q; |
201 int m; | |
202 int two_m; | |
203 int two_m_plus_one; | |
204 | |
8254
772d6d27fd66
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
4497
diff
changeset
|
205 sample_t tmp_b_i; |
772d6d27fd66
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
4497
diff
changeset
|
206 sample_t tmp_b_r; |
772d6d27fd66
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
4497
diff
changeset
|
207 #endif |
3394 | 208 sample_t tmp_a_i; |
209 sample_t tmp_a_r; | |
210 | |
211 sample_t *data_ptr; | |
212 sample_t *delay_ptr; | |
213 sample_t *window_ptr; | |
214 | |
215 /* 512 IMDCT with source and dest data in 'data' */ | |
216 | |
3884 | 217 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/ |
3394 | 218 for( i=0; i < 128; i++) { |
219 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ | |
3884 | 220 #ifdef USE_AC3_C |
221 int j= pm128[i]; | |
222 #else | |
223 int j= bit_reverse_512[i]; | |
224 #endif | |
225 buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]); | |
226 buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j])); | |
3394 | 227 } |
228 | |
229 /* FFT Merge */ | |
3549 | 230 /* unoptimized variant |
231 for (m=1; m < 7; m++) { | |
232 if(m) | |
233 two_m = (1 << m); | |
234 else | |
235 two_m = 1; | |
236 | |
237 two_m_plus_one = (1 << (m+1)); | |
238 | |
239 for(i = 0; i < 128; i += two_m_plus_one) { | |
240 for(k = 0; k < two_m; k++) { | |
241 p = k + i; | |
242 q = p + two_m; | |
3508 | 243 tmp_a_r = buf[p].real; |
244 tmp_a_i = buf[p].imag; | |
3549 | 245 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; |
246 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; | |
3508 | 247 buf[p].real = tmp_a_r + tmp_b_r; |
248 buf[p].imag = tmp_a_i + tmp_b_i; | |
249 buf[q].real = tmp_a_r - tmp_b_r; | |
250 buf[q].imag = tmp_a_i - tmp_b_i; | |
3549 | 251 } |
252 } | |
253 } | |
254 */ | |
3884 | 255 #ifdef USE_AC3_C |
256 fft_128p (&buf[0]); | |
257 #else | |
258 | |
3623 | 259 /* 1. iteration */ |
3579 | 260 for(i = 0; i < 128; i += 2) { |
261 tmp_a_r = buf[i].real; | |
262 tmp_a_i = buf[i].imag; | |
263 tmp_b_r = buf[i+1].real; | |
264 tmp_b_i = buf[i+1].imag; | |
265 buf[i].real = tmp_a_r + tmp_b_r; | |
266 buf[i].imag = tmp_a_i + tmp_b_i; | |
267 buf[i+1].real = tmp_a_r - tmp_b_r; | |
268 buf[i+1].imag = tmp_a_i - tmp_b_i; | |
269 } | |
270 | |
3623 | 271 /* 2. iteration */ |
3579 | 272 // Note w[1]={{1,0}, {0,-1}} |
273 for(i = 0; i < 128; i += 4) { | |
274 tmp_a_r = buf[i].real; | |
275 tmp_a_i = buf[i].imag; | |
276 tmp_b_r = buf[i+2].real; | |
277 tmp_b_i = buf[i+2].imag; | |
278 buf[i].real = tmp_a_r + tmp_b_r; | |
279 buf[i].imag = tmp_a_i + tmp_b_i; | |
280 buf[i+2].real = tmp_a_r - tmp_b_r; | |
281 buf[i+2].imag = tmp_a_i - tmp_b_i; | |
282 tmp_a_r = buf[i+1].real; | |
283 tmp_a_i = buf[i+1].imag; | |
284 tmp_b_r = buf[i+3].imag; | |
285 tmp_b_i = buf[i+3].real; | |
286 buf[i+1].real = tmp_a_r + tmp_b_r; | |
287 buf[i+1].imag = tmp_a_i - tmp_b_i; | |
288 buf[i+3].real = tmp_a_r - tmp_b_r; | |
289 buf[i+3].imag = tmp_a_i + tmp_b_i; | |
290 } | |
291 | |
3623 | 292 /* 3. iteration */ |
3579 | 293 for(i = 0; i < 128; i += 8) { |
294 tmp_a_r = buf[i].real; | |
295 tmp_a_i = buf[i].imag; | |
296 tmp_b_r = buf[i+4].real; | |
297 tmp_b_i = buf[i+4].imag; | |
298 buf[i].real = tmp_a_r + tmp_b_r; | |
299 buf[i].imag = tmp_a_i + tmp_b_i; | |
300 buf[i+4].real = tmp_a_r - tmp_b_r; | |
301 buf[i+4].imag = tmp_a_i - tmp_b_i; | |
302 tmp_a_r = buf[1+i].real; | |
303 tmp_a_i = buf[1+i].imag; | |
304 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real; | |
305 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real; | |
306 buf[1+i].real = tmp_a_r + tmp_b_r; | |
307 buf[1+i].imag = tmp_a_i + tmp_b_i; | |
308 buf[i+5].real = tmp_a_r - tmp_b_r; | |
309 buf[i+5].imag = tmp_a_i - tmp_b_i; | |
310 tmp_a_r = buf[i+2].real; | |
311 tmp_a_i = buf[i+2].imag; | |
312 tmp_b_r = buf[i+6].imag; | |
313 tmp_b_i = - buf[i+6].real; | |
314 buf[i+2].real = tmp_a_r + tmp_b_r; | |
315 buf[i+2].imag = tmp_a_i + tmp_b_i; | |
316 buf[i+6].real = tmp_a_r - tmp_b_r; | |
317 buf[i+6].imag = tmp_a_i - tmp_b_i; | |
318 tmp_a_r = buf[i+3].real; | |
319 tmp_a_i = buf[i+3].imag; | |
320 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag; | |
321 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag; | |
322 buf[i+3].real = tmp_a_r + tmp_b_r; | |
323 buf[i+3].imag = tmp_a_i + tmp_b_i; | |
324 buf[i+7].real = tmp_a_r - tmp_b_r; | |
325 buf[i+7].imag = tmp_a_i - tmp_b_i; | |
326 } | |
327 | |
3623 | 328 /* 4-7. iterations */ |
3579 | 329 for (m=3; m < 7; m++) { |
330 two_m = (1 << m); | |
331 | |
332 two_m_plus_one = two_m<<1; | |
333 | |
334 for(i = 0; i < 128; i += two_m_plus_one) { | |
335 for(k = 0; k < two_m; k++) { | |
336 int p = k + i; | |
337 int q = p + two_m; | |
338 tmp_a_r = buf[p].real; | |
339 tmp_a_i = buf[p].imag; | |
340 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; | |
341 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; | |
342 buf[p].real = tmp_a_r + tmp_b_r; | |
343 buf[p].imag = tmp_a_i + tmp_b_i; | |
344 buf[q].real = tmp_a_r - tmp_b_r; | |
345 buf[q].imag = tmp_a_i - tmp_b_i; | |
346 } | |
347 } | |
348 } | |
3884 | 349 #endif |
3579 | 350 /* Post IFFT complex multiply plus IFFT complex conjugate*/ |
351 for( i=0; i < 128; i++) { | |
352 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */ | |
353 tmp_a_r = buf[i].real; | |
354 tmp_a_i = -1.0 * buf[i].imag; | |
355 buf[i].real =(tmp_a_r * xcos1[i]) - (tmp_a_i * xsin1[i]); | |
356 buf[i].imag =(tmp_a_r * xsin1[i]) + (tmp_a_i * xcos1[i]); | |
357 } | |
358 | |
359 data_ptr = data; | |
360 delay_ptr = delay; | |
361 window_ptr = imdct_window; | |
362 | |
363 /* Window and convert to real valued signal */ | |
364 for(i=0; i< 64; i++) { | |
365 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; | |
366 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; | |
367 } | |
368 | |
369 for(i=0; i< 64; i++) { | |
370 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; | |
371 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; | |
372 } | |
3884 | 373 |
3579 | 374 /* The trailing edge of the window goes into the delay line */ |
375 delay_ptr = delay; | |
376 | |
377 for(i=0; i< 64; i++) { | |
378 *delay_ptr++ = -buf[64+i].real * *--window_ptr; | |
379 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr; | |
380 } | |
381 | |
382 for(i=0; i<64; i++) { | |
383 *delay_ptr++ = buf[i].imag * *--window_ptr; | |
384 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; | |
385 } | |
386 } | |
387 | |
9001
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
388 #ifdef HAVE_ALTIVEC |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
389 |
9122 | 390 #ifndef SYS_DARWIN |
391 #include <altivec.h> | |
392 #endif | |
393 | |
9001
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
394 // used to build registers permutation vectors (vcprm) |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
395 // the 's' are for words in the _s_econd vector |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
396 #define WORD_0 0x00,0x01,0x02,0x03 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
397 #define WORD_1 0x04,0x05,0x06,0x07 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
398 #define WORD_2 0x08,0x09,0x0a,0x0b |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
399 #define WORD_3 0x0c,0x0d,0x0e,0x0f |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
400 #define WORD_s0 0x10,0x11,0x12,0x13 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
401 #define WORD_s1 0x14,0x15,0x16,0x17 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
402 #define WORD_s2 0x18,0x19,0x1a,0x1b |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
403 #define WORD_s3 0x1c,0x1d,0x1e,0x1f |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
404 |
9122 | 405 #ifdef SYS_DARWIN |
9001
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
406 #define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d) |
9122 | 407 #else |
408 #define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d} | |
409 #endif | |
9001
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
410 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
411 // vcprmle is used to keep the same index as in the SSE version. |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
412 // it's the same as vcprm, with the index inversed |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
413 // ('le' is Little Endian) |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
414 #define vcprmle(a,b,c,d) vcprm(d,c,b,a) |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
415 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
416 // used to build inverse/identity vectors (vcii) |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
417 // n is _n_egative, p is _p_ositive |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
418 #define FLOAT_n -1. |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
419 #define FLOAT_p 1. |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
420 |
9122 | 421 #ifdef SYS_DARWIN |
9001
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
422 #define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d) |
9122 | 423 #else |
424 #define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d} | |
425 #endif | |
426 | |
427 #ifdef SYS_DARWIN | |
428 #define FOUROF(a) (a) | |
429 #else | |
430 #define FOUROF(a) {a,a,a,a} | |
431 #endif | |
432 | |
9001
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
433 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
434 void |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
435 imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias) |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
436 { |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
437 int i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
438 int k; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
439 int p,q; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
440 int m; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
441 int two_m; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
442 int two_m_plus_one; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
443 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
444 sample_t tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
445 sample_t tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
446 sample_t tmp_a_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
447 sample_t tmp_a_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
448 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
449 sample_t *data_ptr; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
450 sample_t *delay_ptr; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
451 sample_t *window_ptr; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
452 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
453 /* 512 IMDCT with source and dest data in 'data' */ |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
454 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
455 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/ |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
456 for( i=0; i < 128; i++) { |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
457 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
458 int j= bit_reverse_512[i]; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
459 buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
460 buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j])); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
461 } |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
462 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
463 /* 1. iteration */ |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
464 for(i = 0; i < 128; i += 2) { |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
465 #if 0 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
466 tmp_a_r = buf[i].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
467 tmp_a_i = buf[i].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
468 tmp_b_r = buf[i+1].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
469 tmp_b_i = buf[i+1].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
470 buf[i].real = tmp_a_r + tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
471 buf[i].imag = tmp_a_i + tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
472 buf[i+1].real = tmp_a_r - tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
473 buf[i+1].imag = tmp_a_i - tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
474 #else |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
475 vector float temp, bufv; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
476 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
477 bufv = vec_ld(i << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
478 temp = vec_perm(bufv, bufv, vcprm(2,3,0,1)); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
479 bufv = vec_madd(bufv, vcii(p,p,n,n), temp); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
480 vec_st(bufv, i << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
481 #endif |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
482 } |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
483 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
484 /* 2. iteration */ |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
485 // Note w[1]={{1,0}, {0,-1}} |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
486 for(i = 0; i < 128; i += 4) { |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
487 #if 0 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
488 tmp_a_r = buf[i].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
489 tmp_a_i = buf[i].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
490 tmp_b_r = buf[i+2].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
491 tmp_b_i = buf[i+2].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
492 buf[i].real = tmp_a_r + tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
493 buf[i].imag = tmp_a_i + tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
494 buf[i+2].real = tmp_a_r - tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
495 buf[i+2].imag = tmp_a_i - tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
496 tmp_a_r = buf[i+1].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
497 tmp_a_i = buf[i+1].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
498 /* WARNING: im <-> re here ! */ |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
499 tmp_b_r = buf[i+3].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
500 tmp_b_i = buf[i+3].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
501 buf[i+1].real = tmp_a_r + tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
502 buf[i+1].imag = tmp_a_i - tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
503 buf[i+3].real = tmp_a_r - tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
504 buf[i+3].imag = tmp_a_i + tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
505 #else |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
506 vector float buf01, buf23, temp1, temp2; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
507 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
508 buf01 = vec_ld((i + 0) << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
509 buf23 = vec_ld((i + 2) << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
510 buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2)); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
511 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
512 temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
513 temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
514 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
515 vec_st(temp1, (i + 0) << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
516 vec_st(temp2, (i + 2) << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
517 #endif |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
518 } |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
519 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
520 /* 3. iteration */ |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
521 for(i = 0; i < 128; i += 8) { |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
522 #if 0 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
523 tmp_a_r = buf[i].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
524 tmp_a_i = buf[i].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
525 tmp_b_r = buf[i+4].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
526 tmp_b_i = buf[i+4].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
527 buf[i].real = tmp_a_r + tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
528 buf[i].imag = tmp_a_i + tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
529 buf[i+4].real = tmp_a_r - tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
530 buf[i+4].imag = tmp_a_i - tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
531 tmp_a_r = buf[1+i].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
532 tmp_a_i = buf[1+i].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
533 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
534 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
535 buf[1+i].real = tmp_a_r + tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
536 buf[1+i].imag = tmp_a_i + tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
537 buf[i+5].real = tmp_a_r - tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
538 buf[i+5].imag = tmp_a_i - tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
539 tmp_a_r = buf[i+2].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
540 tmp_a_i = buf[i+2].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
541 /* WARNING re <-> im & sign */ |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
542 tmp_b_r = buf[i+6].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
543 tmp_b_i = - buf[i+6].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
544 buf[i+2].real = tmp_a_r + tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
545 buf[i+2].imag = tmp_a_i + tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
546 buf[i+6].real = tmp_a_r - tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
547 buf[i+6].imag = tmp_a_i - tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
548 tmp_a_r = buf[i+3].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
549 tmp_a_i = buf[i+3].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
550 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
551 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
552 buf[i+3].real = tmp_a_r + tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
553 buf[i+3].imag = tmp_a_i + tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
554 buf[i+7].real = tmp_a_r - tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
555 buf[i+7].imag = tmp_a_i - tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
556 #else |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
557 vector float buf01, buf23, buf45, buf67; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
558 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
559 buf01 = vec_ld((i + 0) << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
560 buf23 = vec_ld((i + 2) << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
561 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
562 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
563 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
564 buf[i+5].real = tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
565 buf[i+5].imag = tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
566 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
567 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
568 buf[i+7].real = tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
569 buf[i+7].imag = tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
570 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
571 buf23 = vec_ld((i + 2) << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
572 buf45 = vec_ld((i + 4) << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
573 buf67 = vec_ld((i + 6) << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
574 buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3)); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
575 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
576 vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
577 vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
578 vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
579 vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
580 #endif |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
581 } |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
582 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
583 /* 4-7. iterations */ |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
584 for (m=3; m < 7; m++) { |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
585 two_m = (1 << m); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
586 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
587 two_m_plus_one = two_m<<1; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
588 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
589 for(i = 0; i < 128; i += two_m_plus_one) { |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
590 for(k = 0; k < two_m; k+=2) { |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
591 #if 0 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
592 int p = k + i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
593 int q = p + two_m; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
594 tmp_a_r = buf[p].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
595 tmp_a_i = buf[p].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
596 tmp_b_r = |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
597 buf[q].real * w[m][k].real - |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
598 buf[q].imag * w[m][k].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
599 tmp_b_i = |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
600 buf[q].imag * w[m][k].real + |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
601 buf[q].real * w[m][k].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
602 buf[p].real = tmp_a_r + tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
603 buf[p].imag = tmp_a_i + tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
604 buf[q].real = tmp_a_r - tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
605 buf[q].imag = tmp_a_i - tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
606 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
607 tmp_a_r = buf[(p + 1)].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
608 tmp_a_i = buf[(p + 1)].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
609 tmp_b_r = |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
610 buf[(q + 1)].real * w[m][(k + 1)].real - |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
611 buf[(q + 1)].imag * w[m][(k + 1)].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
612 tmp_b_i = |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
613 buf[(q + 1)].imag * w[m][(k + 1)].real + |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
614 buf[(q + 1)].real * w[m][(k + 1)].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
615 buf[(p + 1)].real = tmp_a_r + tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
616 buf[(p + 1)].imag = tmp_a_i + tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
617 buf[(q + 1)].real = tmp_a_r - tmp_b_r; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
618 buf[(q + 1)].imag = tmp_a_i - tmp_b_i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
619 #else |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
620 int p = k + i; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
621 int q = p + two_m; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
622 vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4; |
9122 | 623 const vector float vczero = (const vector float)FOUROF(0.); |
9001
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
624 // first compute buf[q] and buf[q+1] |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
625 vecq = vec_ld(q << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
626 vecw = vec_ld(0, (float*)&(w[m][k])); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
627 temp1 = vec_madd(vecq, vecw, vczero); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
628 temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2)); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
629 temp2 = vec_madd(temp2, vecw, vczero); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
630 temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2)); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
631 temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3)); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
632 vecq = vec_madd(temp4, vcii(n,p,n,p), temp3); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
633 // then butterfly with buf[p] and buf[p+1] |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
634 vecp = vec_ld(p << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
635 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
636 temp1 = vec_add(vecp, vecq); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
637 temp2 = vec_sub(vecp, vecq); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
638 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
639 vec_st(temp1, p << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
640 vec_st(temp2, q << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
641 #endif |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
642 } |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
643 } |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
644 } |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
645 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
646 /* Post IFFT complex multiply plus IFFT complex conjugate*/ |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
647 for( i=0; i < 128; i+=4) { |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
648 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */ |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
649 #if 0 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
650 tmp_a_r = buf[(i + 0)].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
651 tmp_a_i = -1.0 * buf[(i + 0)].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
652 buf[(i + 0)].real = |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
653 (tmp_a_r * xcos1[(i + 0)]) - (tmp_a_i * xsin1[(i + 0)]); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
654 buf[(i + 0)].imag = |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
655 (tmp_a_r * xsin1[(i + 0)]) + (tmp_a_i * xcos1[(i + 0)]); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
656 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
657 tmp_a_r = buf[(i + 1)].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
658 tmp_a_i = -1.0 * buf[(i + 1)].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
659 buf[(i + 1)].real = |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
660 (tmp_a_r * xcos1[(i + 1)]) - (tmp_a_i * xsin1[(i + 1)]); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
661 buf[(i + 1)].imag = |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
662 (tmp_a_r * xsin1[(i + 1)]) + (tmp_a_i * xcos1[(i + 1)]); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
663 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
664 tmp_a_r = buf[(i + 2)].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
665 tmp_a_i = -1.0 * buf[(i + 2)].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
666 buf[(i + 2)].real = |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
667 (tmp_a_r * xcos1[(i + 2)]) - (tmp_a_i * xsin1[(i + 2)]); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
668 buf[(i + 2)].imag = |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
669 (tmp_a_r * xsin1[(i + 2)]) + (tmp_a_i * xcos1[(i + 2)]); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
670 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
671 tmp_a_r = buf[(i + 3)].real; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
672 tmp_a_i = -1.0 * buf[(i + 3)].imag; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
673 buf[(i + 3)].real = |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
674 (tmp_a_r * xcos1[(i + 3)]) - (tmp_a_i * xsin1[(i + 3)]); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
675 buf[(i + 3)].imag = |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
676 (tmp_a_r * xsin1[(i + 3)]) + (tmp_a_i * xcos1[(i + 3)]); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
677 #else |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
678 vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
679 vector float temp0022, temp1133, tempCS01; |
9122 | 680 const vector float vczero = (const vector float)FOUROF(0.); |
9001
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
681 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
682 bufv_0 = vec_ld((i + 0) << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
683 bufv_2 = vec_ld((i + 2) << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
684 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
685 cosv = vec_ld(i << 2, xcos1); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
686 sinv = vec_ld(i << 2, xsin1); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
687 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
688 temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2)); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
689 temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3)); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
690 tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1)); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
691 temp1 = vec_madd(temp0022, tempCS01, vczero); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
692 tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1)); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
693 temp2 = vec_madd(temp1133, tempCS01, vczero); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
694 bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
695 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
696 vec_st(bufv_0, (i + 0) << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
697 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
698 /* idem with bufv_2 and high-order cosv/sinv */ |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
699 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
700 temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2)); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
701 temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3)); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
702 tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3)); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
703 temp1 = vec_madd(temp0022, tempCS01, vczero); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
704 tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3)); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
705 temp2 = vec_madd(temp1133, tempCS01, vczero); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
706 bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
707 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
708 vec_st(bufv_2, (i + 2) << 3, (float*)buf); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
709 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
710 #endif |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
711 } |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
712 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
713 data_ptr = data; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
714 delay_ptr = delay; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
715 window_ptr = imdct_window; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
716 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
717 /* Window and convert to real valued signal */ |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
718 for(i=0; i< 64; i++) { |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
719 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
720 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
721 } |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
722 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
723 for(i=0; i< 64; i++) { |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
724 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
725 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
726 } |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
727 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
728 /* The trailing edge of the window goes into the delay line */ |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
729 delay_ptr = delay; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
730 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
731 for(i=0; i< 64; i++) { |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
732 *delay_ptr++ = -buf[64+i].real * *--window_ptr; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
733 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
734 } |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
735 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
736 for(i=0; i<64; i++) { |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
737 *delay_ptr++ = buf[i].imag * *--window_ptr; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
738 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
739 } |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
740 } |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
741 #endif |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
742 |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
743 |
4497 | 744 // Stuff below this line is borrowed from libac3 |
745 #include "srfftp.h" | |
3579 | 746 #ifdef ARCH_X86 |
4497 | 747 #ifndef HAVE_3DNOW |
748 #define HAVE_3DNOW 1 | |
749 #endif | |
3884 | 750 #include "srfftp_3dnow.h" |
751 | |
8451 | 752 const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }}; |
753 const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }}; | |
3884 | 754 const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 }; |
755 | |
4497 | 756 #undef HAVE_3DNOWEX |
757 #include "imdct_3dnow.h" | |
758 #define HAVE_3DNOWEX | |
759 #include "imdct_3dnow.h" | |
3884 | 760 |
3579 | 761 void |
762 imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias) | |
763 { | |
8254
772d6d27fd66
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
4497
diff
changeset
|
764 /* int i,k; |
772d6d27fd66
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
4497
diff
changeset
|
765 int p,q;*/ |
3579 | 766 int m; |
767 int two_m; | |
768 int two_m_plus_one; | |
769 | |
8254
772d6d27fd66
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
4497
diff
changeset
|
770 /* sample_t tmp_a_i; |
3579 | 771 sample_t tmp_a_r; |
772 sample_t tmp_b_i; | |
8254
772d6d27fd66
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
4497
diff
changeset
|
773 sample_t tmp_b_r;*/ |
3579 | 774 |
775 sample_t *data_ptr; | |
776 sample_t *delay_ptr; | |
777 sample_t *window_ptr; | |
778 | |
779 /* 512 IMDCT with source and dest data in 'data' */ | |
3623 | 780 /* see the c version (dct_do_512()), its allmost identical, just in C */ |
781 | |
3579 | 782 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */ |
783 /* Bit reversed shuffling */ | |
784 asm volatile( | |
785 "xorl %%esi, %%esi \n\t" | |
4247
2dbd637ffe05
mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents:
3908
diff
changeset
|
786 "leal "MANGLE(bit_reverse_512)", %%eax \n\t" |
3579 | 787 "movl $1008, %%edi \n\t" |
788 "pushl %%ebp \n\t" //use ebp without telling gcc | |
789 ".balign 16 \n\t" | |
790 "1: \n\t" | |
3584 | 791 "movlps (%0, %%esi), %%xmm0 \n\t" // XXXI |
792 "movhps 8(%0, %%edi), %%xmm0 \n\t" // RXXI | |
793 "movlps 8(%0, %%esi), %%xmm1 \n\t" // XXXi | |
794 "movhps (%0, %%edi), %%xmm1 \n\t" // rXXi | |
795 "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR | |
4247
2dbd637ffe05
mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents:
3908
diff
changeset
|
796 "movaps "MANGLE(sseSinCos1c)"(%%esi), %%xmm2\n\t" |
3584 | 797 "mulps %%xmm0, %%xmm2 \n\t" |
798 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI | |
4247
2dbd637ffe05
mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents:
3908
diff
changeset
|
799 "mulps "MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t" |
3584 | 800 "subps %%xmm0, %%xmm2 \n\t" |
3579 | 801 "movzbl (%%eax), %%edx \n\t" |
802 "movzbl 1(%%eax), %%ebp \n\t" | |
3584 | 803 "movlps %%xmm2, (%1, %%edx,8) \n\t" |
804 "movhps %%xmm2, (%1, %%ebp,8) \n\t" | |
3579 | 805 "addl $16, %%esi \n\t" |
806 "addl $2, %%eax \n\t" // avoid complex addressing for P4 crap | |
807 "subl $16, %%edi \n\t" | |
808 " jnc 1b \n\t" | |
809 "popl %%ebp \n\t"//no we didnt touch ebp *g* | |
810 :: "b" (data), "c" (buf) | |
811 : "%esi", "%edi", "%eax", "%edx" | |
812 ); | |
813 | |
814 | |
815 /* FFT Merge */ | |
816 /* unoptimized variant | |
817 for (m=1; m < 7; m++) { | |
818 if(m) | |
819 two_m = (1 << m); | |
820 else | |
821 two_m = 1; | |
822 | |
823 two_m_plus_one = (1 << (m+1)); | |
824 | |
825 for(i = 0; i < 128; i += two_m_plus_one) { | |
826 for(k = 0; k < two_m; k++) { | |
827 p = k + i; | |
828 q = p + two_m; | |
829 tmp_a_r = buf[p].real; | |
830 tmp_a_i = buf[p].imag; | |
831 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; | |
832 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; | |
833 buf[p].real = tmp_a_r + tmp_b_r; | |
834 buf[p].imag = tmp_a_i + tmp_b_i; | |
835 buf[q].real = tmp_a_r - tmp_b_r; | |
836 buf[q].imag = tmp_a_i - tmp_b_i; | |
837 } | |
838 } | |
839 } | |
840 */ | |
841 | |
3623 | 842 /* 1. iteration */ |
3549 | 843 // Note w[0][0]={1,0} |
3508 | 844 asm volatile( |
845 "xorps %%xmm1, %%xmm1 \n\t" | |
846 "xorps %%xmm2, %%xmm2 \n\t" | |
847 "movl %0, %%esi \n\t" | |
3529 | 848 ".balign 16 \n\t" |
3508 | 849 "1: \n\t" |
850 "movlps (%%esi), %%xmm0 \n\t" //buf[p] | |
851 "movlps 8(%%esi), %%xmm1\n\t" //buf[q] | |
852 "movhps (%%esi), %%xmm0 \n\t" //buf[p] | |
853 "movhps 8(%%esi), %%xmm2\n\t" //buf[q] | |
854 "addps %%xmm1, %%xmm0 \n\t" | |
855 "subps %%xmm2, %%xmm0 \n\t" | |
856 "movaps %%xmm0, (%%esi) \n\t" | |
857 "addl $16, %%esi \n\t" | |
858 "cmpl %1, %%esi \n\t" | |
859 " jb 1b \n\t" | |
860 :: "g" (buf), "r" (buf + 128) | |
861 : "%esi" | |
862 ); | |
3549 | 863 |
3623 | 864 /* 2. iteration */ |
3512 | 865 // Note w[1]={{1,0}, {0,-1}} |
866 asm volatile( | |
4247
2dbd637ffe05
mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents:
3908
diff
changeset
|
867 "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1 |
3512 | 868 "movl %0, %%esi \n\t" |
3529 | 869 ".balign 16 \n\t" |
3512 | 870 "1: \n\t" |
871 "movaps 16(%%esi), %%xmm2 \n\t" //r2,i2,r3,i3 | |
872 "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3 | |
873 "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3 | |
874 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1 | |
875 "movaps (%%esi), %%xmm1 \n\t" //r0,i0,r1,i1 | |
876 "addps %%xmm2, %%xmm0 \n\t" | |
877 "subps %%xmm2, %%xmm1 \n\t" | |
878 "movaps %%xmm0, (%%esi) \n\t" | |
879 "movaps %%xmm1, 16(%%esi) \n\t" | |
880 "addl $32, %%esi \n\t" | |
881 "cmpl %1, %%esi \n\t" | |
882 " jb 1b \n\t" | |
883 :: "g" (buf), "r" (buf + 128) | |
884 : "%esi" | |
885 ); | |
3549 | 886 |
3623 | 887 /* 3. iteration */ |
3534 | 888 /* |
889 Note sseW2+0={1,1,sqrt(2),sqrt(2)) | |
890 Note sseW2+16={0,0,sqrt(2),-sqrt(2)) | |
891 Note sseW2+32={0,0,-sqrt(2),-sqrt(2)) | |
892 Note sseW2+48={1,-1,sqrt(2),-sqrt(2)) | |
893 */ | |
894 asm volatile( | |
4247
2dbd637ffe05
mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents:
3908
diff
changeset
|
895 "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t" |
2dbd637ffe05
mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents:
3908
diff
changeset
|
896 "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t" |
3534 | 897 "xorps %%xmm5, %%xmm5 \n\t" |
898 "xorps %%xmm2, %%xmm2 \n\t" | |
899 "movl %0, %%esi \n\t" | |
900 ".balign 16 \n\t" | |
901 "1: \n\t" | |
3537 | 902 "movaps 32(%%esi), %%xmm2 \n\t" //r4,i4,r5,i5 |
3534 | 903 "movaps 48(%%esi), %%xmm3 \n\t" //r6,i6,r7,i7 |
4247
2dbd637ffe05
mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents:
3908
diff
changeset
|
904 "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5 |
2dbd637ffe05
mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents:
3908
diff
changeset
|
905 "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7 |
3537 | 906 "mulps %%xmm2, %%xmm4 \n\t" |
907 "mulps %%xmm3, %%xmm5 \n\t" | |
3534 | 908 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5 |
909 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7 | |
3537 | 910 "mulps %%xmm6, %%xmm3 \n\t" |
3534 | 911 "mulps %%xmm7, %%xmm2 \n\t" |
912 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1 | |
913 "movaps 16(%%esi), %%xmm1 \n\t" //r2,i2,r3,i3 | |
914 "addps %%xmm4, %%xmm2 \n\t" | |
915 "addps %%xmm5, %%xmm3 \n\t" | |
916 "movaps %%xmm2, %%xmm4 \n\t" | |
917 "movaps %%xmm3, %%xmm5 \n\t" | |
918 "addps %%xmm0, %%xmm2 \n\t" | |
919 "addps %%xmm1, %%xmm3 \n\t" | |
920 "subps %%xmm4, %%xmm0 \n\t" | |
921 "subps %%xmm5, %%xmm1 \n\t" | |
922 "movaps %%xmm2, (%%esi) \n\t" | |
923 "movaps %%xmm3, 16(%%esi) \n\t" | |
924 "movaps %%xmm0, 32(%%esi) \n\t" | |
925 "movaps %%xmm1, 48(%%esi) \n\t" | |
926 "addl $64, %%esi \n\t" | |
927 "cmpl %1, %%esi \n\t" | |
928 " jb 1b \n\t" | |
929 :: "g" (buf), "r" (buf + 128) | |
930 : "%esi" | |
931 ); | |
3508 | 932 |
3623 | 933 /* 4-7. iterations */ |
3546 | 934 for (m=3; m < 7; m++) { |
935 two_m = (1 << m); | |
936 two_m_plus_one = two_m<<1; | |
937 asm volatile( | |
938 "movl %0, %%esi \n\t" | |
939 ".balign 16 \n\t" | |
940 "1: \n\t" | |
941 "xorl %%edi, %%edi \n\t" // k | |
942 "leal (%%esi, %3), %%edx \n\t" | |
943 "2: \n\t" | |
944 "movaps (%%edx, %%edi), %%xmm1 \n\t" | |
945 "movaps (%4, %%edi, 2), %%xmm2 \n\t" | |
946 "mulps %%xmm1, %%xmm2 \n\t" | |
947 "shufps $0xB1, %%xmm1, %%xmm1 \n\t" | |
948 "mulps 16(%4, %%edi, 2), %%xmm1 \n\t" | |
949 "movaps (%%esi, %%edi), %%xmm0 \n\t" | |
950 "addps %%xmm2, %%xmm1 \n\t" | |
951 "movaps %%xmm1, %%xmm2 \n\t" | |
952 "addps %%xmm0, %%xmm1 \n\t" | |
953 "subps %%xmm2, %%xmm0 \n\t" | |
954 "movaps %%xmm1, (%%esi, %%edi) \n\t" | |
955 "movaps %%xmm0, (%%edx, %%edi) \n\t" | |
956 "addl $16, %%edi \n\t" | |
957 "cmpl %3, %%edi \n\t" //FIXME (opt) count against 0 | |
958 " jb 2b \n\t" | |
959 "addl %2, %%esi \n\t" | |
960 "cmpl %1, %%esi \n\t" | |
961 " jb 1b \n\t" | |
962 :: "g" (buf), "m" (buf+128), "m" (two_m_plus_one<<3), "r" (two_m<<3), | |
963 "r" (sseW[m]) | |
964 : "%esi", "%edi", "%edx" | |
965 ); | |
966 } | |
967 | |
3623 | 968 /* Post IFFT complex multiply plus IFFT complex conjugate*/ |
3581 | 969 asm volatile( |
4247
2dbd637ffe05
mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents:
3908
diff
changeset
|
970 "movl $-1024, %%esi \n\t" |
3581 | 971 ".balign 16 \n\t" |
972 "1: \n\t" | |
973 "movaps (%0, %%esi), %%xmm0 \n\t" | |
974 "movaps (%0, %%esi), %%xmm1 \n\t" | |
975 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" | |
4247
2dbd637ffe05
mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents:
3908
diff
changeset
|
976 "mulps 1024+"MANGLE(sseSinCos1c)"(%%esi), %%xmm1\n\t" |
2dbd637ffe05
mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents:
3908
diff
changeset
|
977 "mulps 1024+"MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t" |
3581 | 978 "addps %%xmm1, %%xmm0 \n\t" |
979 "movaps %%xmm0, (%0, %%esi) \n\t" | |
980 "addl $16, %%esi \n\t" | |
981 " jnz 1b \n\t" | |
982 :: "r" (buf+128) | |
983 : "%esi" | |
984 ); | |
985 | |
3394 | 986 |
987 data_ptr = data; | |
988 delay_ptr = delay; | |
989 window_ptr = imdct_window; | |
990 | |
991 /* Window and convert to real valued signal */ | |
3552 | 992 asm volatile( |
993 "xorl %%edi, %%edi \n\t" // 0 | |
994 "xorl %%esi, %%esi \n\t" // 0 | |
995 "movss %3, %%xmm2 \n\t" // bias | |
996 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... | |
997 ".balign 16 \n\t" | |
998 "1: \n\t" | |
999 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ? | |
1000 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ? | |
1001 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ? | |
1002 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ? | |
1003 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A | |
4247
2dbd637ffe05
mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents:
3908
diff
changeset
|
1004 "mulps "MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" |
3552 | 1005 "addps (%2, %%esi), %%xmm0 \n\t" |
1006 "addps %%xmm2, %%xmm0 \n\t" | |
1007 "movaps %%xmm0, (%1, %%esi) \n\t" | |
1008 "addl $16, %%esi \n\t" | |
1009 "subl $16, %%edi \n\t" | |
1010 "cmpl $512, %%esi \n\t" | |
1011 " jb 1b \n\t" | |
1012 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias) | |
1013 : "%esi", "%edi" | |
1014 ); | |
1015 data_ptr+=128; | |
1016 delay_ptr+=128; | |
3553 | 1017 // window_ptr+=128; |
3579 | 1018 |
3552 | 1019 asm volatile( |
1020 "movl $1024, %%edi \n\t" // 512 | |
1021 "xorl %%esi, %%esi \n\t" // 0 | |
1022 "movss %3, %%xmm2 \n\t" // bias | |
1023 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... | |
1024 ".balign 16 \n\t" | |
1025 "1: \n\t" | |
1026 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A | |
1027 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C | |
1028 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C | |
1029 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A | |
1030 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A | |
4247
2dbd637ffe05
mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents:
3908
diff
changeset
|
1031 "mulps 512+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" |
3552 | 1032 "addps (%2, %%esi), %%xmm0 \n\t" |
1033 "addps %%xmm2, %%xmm0 \n\t" | |
1034 "movaps %%xmm0, (%1, %%esi) \n\t" | |
1035 "addl $16, %%esi \n\t" | |
1036 "subl $16, %%edi \n\t" | |
1037 "cmpl $512, %%esi \n\t" | |
1038 " jb 1b \n\t" | |
1039 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias) | |
1040 : "%esi", "%edi" | |
1041 ); | |
1042 data_ptr+=128; | |
3553 | 1043 // window_ptr+=128; |
3394 | 1044 |
1045 /* The trailing edge of the window goes into the delay line */ | |
1046 delay_ptr = delay; | |
1047 | |
3553 | 1048 asm volatile( |
1049 "xorl %%edi, %%edi \n\t" // 0 | |
1050 "xorl %%esi, %%esi \n\t" // 0 | |
1051 ".balign 16 \n\t" | |
1052 "1: \n\t" | |
1053 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A | |
1054 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C | |
1055 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C | |
1056 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A | |
1057 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A | |
4247
2dbd637ffe05
mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents:
3908
diff
changeset
|
1058 "mulps 1024+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" |
3553 | 1059 "movaps %%xmm0, (%1, %%esi) \n\t" |
1060 "addl $16, %%esi \n\t" | |
1061 "subl $16, %%edi \n\t" | |
1062 "cmpl $512, %%esi \n\t" | |
1063 " jb 1b \n\t" | |
1064 :: "r" (buf+64), "r" (delay_ptr) | |
1065 : "%esi", "%edi" | |
1066 ); | |
1067 delay_ptr+=128; | |
1068 // window_ptr-=128; | |
3579 | 1069 |
3553 | 1070 asm volatile( |
1071 "movl $1024, %%edi \n\t" // 1024 | |
1072 "xorl %%esi, %%esi \n\t" // 0 | |
1073 ".balign 16 \n\t" | |
1074 "1: \n\t" | |
1075 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ? | |
1076 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ? | |
1077 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ? | |
1078 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ? | |
1079 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A | |
4247
2dbd637ffe05
mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents:
3908
diff
changeset
|
1080 "mulps 1536+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" |
3553 | 1081 "movaps %%xmm0, (%1, %%esi) \n\t" |
1082 "addl $16, %%esi \n\t" | |
1083 "subl $16, %%edi \n\t" | |
1084 "cmpl $512, %%esi \n\t" | |
1085 " jb 1b \n\t" | |
1086 :: "r" (buf), "r" (delay_ptr) | |
1087 : "%esi", "%edi" | |
1088 ); | |
3394 | 1089 } |
3579 | 1090 #endif //arch_x86 |
3394 | 1091 |
1092 void | |
1093 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias) | |
1094 { | |
1095 int i,k; | |
1096 int p,q; | |
1097 int m; | |
1098 int two_m; | |
1099 int two_m_plus_one; | |
1100 | |
1101 sample_t tmp_a_i; | |
1102 sample_t tmp_a_r; | |
1103 sample_t tmp_b_i; | |
1104 sample_t tmp_b_r; | |
1105 | |
1106 sample_t *data_ptr; | |
1107 sample_t *delay_ptr; | |
1108 sample_t *window_ptr; | |
1109 | |
1110 complex_t *buf_1, *buf_2; | |
1111 | |
1112 buf_1 = &buf[0]; | |
1113 buf_2 = &buf[64]; | |
1114 | |
1115 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */ | |
1116 for(k=0; k<64; k++) { | |
1117 /* X1[k] = X[2*k] */ | |
1118 /* X2[k] = X[2*k+1] */ | |
1119 | |
1120 p = 2 * (128-2*k-1); | |
1121 q = 2 * (2 * k); | |
1122 | |
1123 /* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */ | |
1124 buf_1[k].real = data[p] * xcos2[k] - data[q] * xsin2[k]; | |
1125 buf_1[k].imag = -1.0f * (data[q] * xcos2[k] + data[p] * xsin2[k]); | |
1126 /* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */ | |
1127 buf_2[k].real = data[p + 1] * xcos2[k] - data[q + 1] * xsin2[k]; | |
1128 buf_2[k].imag = -1.0f * ( data[q + 1] * xcos2[k] + data[p + 1] * xsin2[k]); | |
1129 } | |
1130 | |
1131 /* IFFT Bit reversed shuffling */ | |
1132 for(i=0; i<64; i++) { | |
1133 k = bit_reverse_256[i]; | |
1134 if (k < i) { | |
1135 swap_cmplx(&buf_1[i],&buf_1[k]); | |
1136 swap_cmplx(&buf_2[i],&buf_2[k]); | |
1137 } | |
1138 } | |
1139 | |
1140 /* FFT Merge */ | |
1141 for (m=0; m < 6; m++) { | |
1142 two_m = (1 << m); | |
1143 two_m_plus_one = (1 << (m+1)); | |
1144 | |
1145 /* FIXME */ | |
1146 if(m) | |
1147 two_m = (1 << m); | |
1148 else | |
1149 two_m = 1; | |
1150 | |
1151 for(k = 0; k < two_m; k++) { | |
1152 for(i = 0; i < 64; i += two_m_plus_one) { | |
1153 p = k + i; | |
1154 q = p + two_m; | |
1155 /* Do block 1 */ | |
1156 tmp_a_r = buf_1[p].real; | |
1157 tmp_a_i = buf_1[p].imag; | |
1158 tmp_b_r = buf_1[q].real * w[m][k].real - buf_1[q].imag * w[m][k].imag; | |
1159 tmp_b_i = buf_1[q].imag * w[m][k].real + buf_1[q].real * w[m][k].imag; | |
1160 buf_1[p].real = tmp_a_r + tmp_b_r; | |
1161 buf_1[p].imag = tmp_a_i + tmp_b_i; | |
1162 buf_1[q].real = tmp_a_r - tmp_b_r; | |
1163 buf_1[q].imag = tmp_a_i - tmp_b_i; | |
1164 | |
1165 /* Do block 2 */ | |
1166 tmp_a_r = buf_2[p].real; | |
1167 tmp_a_i = buf_2[p].imag; | |
1168 tmp_b_r = buf_2[q].real * w[m][k].real - buf_2[q].imag * w[m][k].imag; | |
1169 tmp_b_i = buf_2[q].imag * w[m][k].real + buf_2[q].real * w[m][k].imag; | |
1170 buf_2[p].real = tmp_a_r + tmp_b_r; | |
1171 buf_2[p].imag = tmp_a_i + tmp_b_i; | |
1172 buf_2[q].real = tmp_a_r - tmp_b_r; | |
1173 buf_2[q].imag = tmp_a_i - tmp_b_i; | |
1174 } | |
1175 } | |
1176 } | |
1177 | |
1178 /* Post IFFT complex multiply */ | |
1179 for( i=0; i < 64; i++) { | |
1180 /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */ | |
1181 tmp_a_r = buf_1[i].real; | |
1182 tmp_a_i = -buf_1[i].imag; | |
1183 buf_1[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]); | |
1184 buf_1[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]); | |
1185 /* y2[n] = z2[n] * (xcos2[n] + j * xsin2[n]) ; */ | |
1186 tmp_a_r = buf_2[i].real; | |
1187 tmp_a_i = -buf_2[i].imag; | |
1188 buf_2[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]); | |
1189 buf_2[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]); | |
1190 } | |
1191 | |
1192 data_ptr = data; | |
1193 delay_ptr = delay; | |
1194 window_ptr = imdct_window; | |
1195 | |
1196 /* Window and convert to real valued signal */ | |
1197 for(i=0; i< 64; i++) { | |
1198 *data_ptr++ = -buf_1[i].imag * *window_ptr++ + *delay_ptr++ + bias; | |
1199 *data_ptr++ = buf_1[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; | |
1200 } | |
1201 | |
1202 for(i=0; i< 64; i++) { | |
1203 *data_ptr++ = -buf_1[i].real * *window_ptr++ + *delay_ptr++ + bias; | |
1204 *data_ptr++ = buf_1[64-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; | |
1205 } | |
1206 | |
1207 delay_ptr = delay; | |
1208 | |
1209 for(i=0; i< 64; i++) { | |
1210 *delay_ptr++ = -buf_2[i].real * *--window_ptr; | |
1211 *delay_ptr++ = buf_2[64-i-1].imag * *--window_ptr; | |
1212 } | |
1213 | |
1214 for(i=0; i< 64; i++) { | |
1215 *delay_ptr++ = buf_2[i].imag * *--window_ptr; | |
1216 *delay_ptr++ = -buf_2[64-i-1].real * *--window_ptr; | |
1217 } | |
1218 } | |
1219 | |
1220 void imdct_init (uint32_t mm_accel) | |
1221 { | |
1222 #ifdef LIBA52_MLIB | |
1223 if (mm_accel & MM_ACCEL_MLIB) { | |
1224 fprintf (stderr, "Using mlib for IMDCT transform\n"); | |
1225 imdct_512 = imdct_do_512_mlib; | |
1226 imdct_256 = imdct_do_256_mlib; | |
1227 } else | |
1228 #endif | |
1229 { | |
1230 int i, j, k; | |
1231 | |
1232 /* Twiddle factors to turn IFFT into IMDCT */ | |
1233 for (i = 0; i < 128; i++) { | |
1234 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1)); | |
1235 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1)); | |
1236 } | |
3579 | 1237 #ifdef ARCH_X86 |
3527 | 1238 for (i = 0; i < 128; i++) { |
3581 | 1239 sseSinCos1c[2*i+0]= xcos1[i]; |
1240 sseSinCos1c[2*i+1]= -xcos1[i]; | |
1241 sseSinCos1d[2*i+0]= xsin1[i]; | |
1242 sseSinCos1d[2*i+1]= xsin1[i]; | |
3527 | 1243 } |
1244 #endif | |
3394 | 1245 |
1246 /* More twiddle factors to turn IFFT into IMDCT */ | |
1247 for (i = 0; i < 64; i++) { | |
1248 xcos2[i] = -cos ((M_PI / 1024) * (8 * i + 1)); | |
1249 xsin2[i] = -sin ((M_PI / 1024) * (8 * i + 1)); | |
1250 } | |
1251 | |
1252 for (i = 0; i < 7; i++) { | |
1253 j = 1 << i; | |
1254 for (k = 0; k < j; k++) { | |
1255 w[i][k].real = cos (-M_PI * k / j); | |
1256 w[i][k].imag = sin (-M_PI * k / j); | |
1257 } | |
1258 } | |
3579 | 1259 #ifdef ARCH_X86 |
3534 | 1260 for (i = 1; i < 7; i++) { |
1261 j = 1 << i; | |
1262 for (k = 0; k < j; k+=2) { | |
1263 | |
1264 sseW[i][4*k + 0] = w[i][k+0].real; | |
1265 sseW[i][4*k + 1] = w[i][k+0].real; | |
1266 sseW[i][4*k + 2] = w[i][k+1].real; | |
1267 sseW[i][4*k + 3] = w[i][k+1].real; | |
1268 | |
1269 sseW[i][4*k + 4] = -w[i][k+0].imag; | |
1270 sseW[i][4*k + 5] = w[i][k+0].imag; | |
1271 sseW[i][4*k + 6] = -w[i][k+1].imag; | |
1272 sseW[i][4*k + 7] = w[i][k+1].imag; | |
1273 | |
1274 //we multiply more or less uninitalized numbers so we need to use exactly 0.0 | |
1275 if(k==0) | |
1276 { | |
1277 // sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0; | |
1278 sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0; | |
1279 } | |
1280 | |
1281 if(2*k == j) | |
1282 { | |
1283 sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0; | |
1284 // sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0); | |
1285 } | |
1286 } | |
1287 } | |
3552 | 1288 |
1289 for(i=0; i<128; i++) | |
1290 { | |
1291 sseWindow[2*i+0]= -imdct_window[2*i+0]; | |
3553 | 1292 sseWindow[2*i+1]= imdct_window[2*i+1]; |
3552 | 1293 } |
3553 | 1294 |
1295 for(i=0; i<64; i++) | |
1296 { | |
1297 sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1]; | |
1298 sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0]; | |
1299 sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1]; | |
1300 sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0]; | |
1301 } | |
3579 | 1302 #endif // arch_x86 |
1303 | |
3720
120ac80f13c2
Fixed #ifdef discrepancy that was breaking compilation on PPC platform
melanson
parents:
3623
diff
changeset
|
1304 imdct_512 = imdct_do_512; |
120ac80f13c2
Fixed #ifdef discrepancy that was breaking compilation on PPC platform
melanson
parents:
3623
diff
changeset
|
1305 #ifdef ARCH_X86 |
4497 | 1306 if(mm_accel & MM_ACCEL_X86_SSE) |
1307 { | |
1308 fprintf (stderr, "Using SSE optimized IMDCT transform\n"); | |
1309 imdct_512 = imdct_do_512_sse; | |
1310 } | |
1311 else | |
1312 if(mm_accel & MM_ACCEL_X86_3DNOWEXT) | |
1313 { | |
1314 fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n"); | |
1315 imdct_512 = imdct_do_512_3dnowex; | |
1316 } | |
1317 else | |
1318 if(mm_accel & MM_ACCEL_X86_3DNOW) | |
1319 { | |
1320 fprintf (stderr, "Using 3DNow optimized IMDCT transform\n"); | |
1321 imdct_512 = imdct_do_512_3dnow; | |
1322 } | |
1323 else | |
3720
120ac80f13c2
Fixed #ifdef discrepancy that was breaking compilation on PPC platform
melanson
parents:
3623
diff
changeset
|
1324 #endif // arch_x86 |
9001
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
1325 #ifdef HAVE_ALTIVEC |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
1326 if (mm_accel & MM_ACCEL_PPC_ALTIVEC) |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
1327 { |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
1328 fprintf(stderr, "Using AltiVec optimized IMDCT transform\n"); |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
1329 imdct_512 = imdct_do_512_altivec; |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
1330 } |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
1331 else |
01a9cf43074c
An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents:
8451
diff
changeset
|
1332 #endif |
4497 | 1333 fprintf (stderr, "No accelerated IMDCT transform found\n"); |
3394 | 1334 imdct_256 = imdct_do_256; |
1335 } | |
1336 } | |
3884 | 1337 |
1338 static void fft_asmb(int k, complex_t *x, complex_t *wTB, | |
1339 const complex_t *d, const complex_t *d_3) | |
1340 { | |
1341 register complex_t *x2k, *x3k, *x4k, *wB; | |
1342 register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i; | |
1343 | |
1344 x2k = x + 2 * k; | |
1345 x3k = x2k + 2 * k; | |
1346 x4k = x3k + 2 * k; | |
1347 wB = wTB + 2 * k; | |
1348 | |
1349 TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]); | |
1350 TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]); | |
1351 | |
1352 --k; | |
1353 for(;;) { | |
1354 TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]); | |
1355 TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]); | |
1356 if (!--k) break; | |
1357 x += 2; | |
1358 x2k += 2; | |
1359 x3k += 2; | |
1360 x4k += 2; | |
1361 d += 2; | |
1362 d_3 += 2; | |
1363 wTB += 2; | |
1364 wB += 2; | |
1365 } | |
1366 | |
1367 } | |
1368 | |
1369 static void fft_asmb16(complex_t *x, complex_t *wTB) | |
1370 { | |
1371 register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i; | |
1372 int k = 2; | |
1373 | |
1374 /* transform x[0], x[8], x[4], x[12] */ | |
1375 TRANSZERO(x[0],x[4],x[8],x[12]); | |
1376 | |
1377 /* transform x[1], x[9], x[5], x[13] */ | |
1378 TRANS(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]); | |
1379 | |
1380 /* transform x[2], x[10], x[6], x[14] */ | |
1381 TRANSHALF_16(x[2],x[6],x[10],x[14]); | |
1382 | |
1383 /* transform x[3], x[11], x[7], x[15] */ | |
1384 TRANS(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]); | |
1385 | |
1386 } | |
1387 | |
1388 static void fft_4(complex_t *x) | |
1389 { | |
1390 /* delta_p = 1 here */ | |
1391 /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4} | |
1392 */ | |
1393 | |
1394 register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i; | |
1395 | |
1396 yt_r = x[0].real; | |
1397 yb_r = yt_r - x[2].real; | |
1398 yt_r += x[2].real; | |
1399 | |
1400 u_r = x[1].real; | |
1401 vi_i = x[3].real - u_r; | |
1402 u_r += x[3].real; | |
1403 | |
1404 u_i = x[1].imag; | |
1405 vi_r = u_i - x[3].imag; | |
1406 u_i += x[3].imag; | |
1407 | |
1408 yt_i = yt_r; | |
1409 yt_i += u_r; | |
1410 x[0].real = yt_i; | |
1411 yt_r -= u_r; | |
1412 x[2].real = yt_r; | |
1413 yt_i = yb_r; | |
1414 yt_i += vi_r; | |
1415 x[1].real = yt_i; | |
1416 yb_r -= vi_r; | |
1417 x[3].real = yb_r; | |
1418 | |
1419 yt_i = x[0].imag; | |
1420 yb_i = yt_i - x[2].imag; | |
1421 yt_i += x[2].imag; | |
1422 | |
1423 yt_r = yt_i; | |
1424 yt_r += u_i; | |
1425 x[0].imag = yt_r; | |
1426 yt_i -= u_i; | |
1427 x[2].imag = yt_i; | |
1428 yt_r = yb_i; | |
1429 yt_r += vi_i; | |
1430 x[1].imag = yt_r; | |
1431 yb_i -= vi_i; | |
1432 x[3].imag = yb_i; | |
1433 } | |
1434 | |
1435 | |
1436 static void fft_8(complex_t *x) | |
1437 { | |
1438 /* delta_p = diag{1, sqrt(i)} here */ | |
1439 /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8} | |
1440 */ | |
1441 register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i; | |
1442 | |
1443 wT1_r = x[1].real; | |
1444 wT1_i = x[1].imag; | |
1445 wB1_r = x[3].real; | |
1446 wB1_i = x[3].imag; | |
1447 | |
1448 x[1] = x[2]; | |
1449 x[2] = x[4]; | |
1450 x[3] = x[6]; | |
1451 fft_4(&x[0]); | |
1452 | |
1453 | |
1454 /* x[0] x[4] */ | |
1455 wT2_r = x[5].real; | |
1456 wT2_r += x[7].real; | |
1457 wT2_r += wT1_r; | |
1458 wT2_r += wB1_r; | |
1459 wT2_i = wT2_r; | |
1460 wT2_r += x[0].real; | |
1461 wT2_i = x[0].real - wT2_i; | |
1462 x[0].real = wT2_r; | |
1463 x[4].real = wT2_i; | |
1464 | |
1465 wT2_i = x[5].imag; | |
1466 wT2_i += x[7].imag; | |
1467 wT2_i += wT1_i; | |
1468 wT2_i += wB1_i; | |
1469 wT2_r = wT2_i; | |
1470 wT2_r += x[0].imag; | |
1471 wT2_i = x[0].imag - wT2_i; | |
1472 x[0].imag = wT2_r; | |
1473 x[4].imag = wT2_i; | |
1474 | |
1475 /* x[2] x[6] */ | |
1476 wT2_r = x[5].imag; | |
1477 wT2_r -= x[7].imag; | |
1478 wT2_r += wT1_i; | |
1479 wT2_r -= wB1_i; | |
1480 wT2_i = wT2_r; | |
1481 wT2_r += x[2].real; | |
1482 wT2_i = x[2].real - wT2_i; | |
1483 x[2].real = wT2_r; | |
1484 x[6].real = wT2_i; | |
1485 | |
1486 wT2_i = x[5].real; | |
1487 wT2_i -= x[7].real; | |
1488 wT2_i += wT1_r; | |
1489 wT2_i -= wB1_r; | |
1490 wT2_r = wT2_i; | |
1491 wT2_r += x[2].imag; | |
1492 wT2_i = x[2].imag - wT2_i; | |
1493 x[2].imag = wT2_i; | |
1494 x[6].imag = wT2_r; | |
1495 | |
1496 | |
1497 /* x[1] x[5] */ | |
1498 wT2_r = wT1_r; | |
1499 wT2_r += wB1_i; | |
1500 wT2_r -= x[5].real; | |
1501 wT2_r -= x[7].imag; | |
1502 wT2_i = wT1_i; | |
1503 wT2_i -= wB1_r; | |
1504 wT2_i -= x[5].imag; | |
1505 wT2_i += x[7].real; | |
1506 | |
1507 wB2_r = wT2_r; | |
1508 wB2_r += wT2_i; | |
1509 wT2_i -= wT2_r; | |
1510 wB2_r *= HSQRT2; | |
1511 wT2_i *= HSQRT2; | |
1512 wT2_r = wB2_r; | |
1513 wB2_r += x[1].real; | |
1514 wT2_r = x[1].real - wT2_r; | |
1515 | |
1516 wB2_i = x[5].real; | |
1517 x[1].real = wB2_r; | |
1518 x[5].real = wT2_r; | |
1519 | |
1520 wT2_r = wT2_i; | |
1521 wT2_r += x[1].imag; | |
1522 wT2_i = x[1].imag - wT2_i; | |
1523 wB2_r = x[5].imag; | |
1524 x[1].imag = wT2_r; | |
1525 x[5].imag = wT2_i; | |
1526 | |
1527 /* x[3] x[7] */ | |
1528 wT1_r -= wB1_i; | |
1529 wT1_i += wB1_r; | |
1530 wB1_r = wB2_i - x[7].imag; | |
1531 wB1_i = wB2_r + x[7].real; | |
1532 wT1_r -= wB1_r; | |
1533 wT1_i -= wB1_i; | |
1534 wB1_r = wT1_r + wT1_i; | |
1535 wB1_r *= HSQRT2; | |
1536 wT1_i -= wT1_r; | |
1537 wT1_i *= HSQRT2; | |
1538 wB2_r = x[3].real; | |
1539 wB2_i = wB2_r + wT1_i; | |
1540 wB2_r -= wT1_i; | |
1541 x[3].real = wB2_i; | |
1542 x[7].real = wB2_r; | |
1543 wB2_i = x[3].imag; | |
1544 wB2_r = wB2_i + wB1_r; | |
1545 wB2_i -= wB1_r; | |
1546 x[3].imag = wB2_i; | |
1547 x[7].imag = wB2_r; | |
1548 } | |
1549 | |
1550 | |
1551 static void fft_128p(complex_t *a) | |
1552 { | |
1553 fft_8(&a[0]); fft_4(&a[8]); fft_4(&a[12]); | |
1554 fft_asmb16(&a[0], &a[8]); | |
1555 | |
1556 fft_8(&a[16]), fft_8(&a[24]); | |
1557 fft_asmb(4, &a[0], &a[16],&delta32[0], &delta32_3[0]); | |
1558 | |
1559 fft_8(&a[32]); fft_4(&a[40]); fft_4(&a[44]); | |
1560 fft_asmb16(&a[32], &a[40]); | |
1561 | |
1562 fft_8(&a[48]); fft_4(&a[56]); fft_4(&a[60]); | |
1563 fft_asmb16(&a[48], &a[56]); | |
1564 | |
1565 fft_asmb(8, &a[0], &a[32],&delta64[0], &delta64_3[0]); | |
1566 | |
1567 fft_8(&a[64]); fft_4(&a[72]); fft_4(&a[76]); | |
1568 /* fft_16(&a[64]); */ | |
1569 fft_asmb16(&a[64], &a[72]); | |
1570 | |
1571 fft_8(&a[80]); fft_8(&a[88]); | |
1572 | |
1573 /* fft_32(&a[64]); */ | |
1574 fft_asmb(4, &a[64], &a[80],&delta32[0], &delta32_3[0]); | |
1575 | |
1576 fft_8(&a[96]); fft_4(&a[104]), fft_4(&a[108]); | |
1577 /* fft_16(&a[96]); */ | |
1578 fft_asmb16(&a[96], &a[104]); | |
1579 | |
1580 fft_8(&a[112]), fft_8(&a[120]); | |
1581 /* fft_32(&a[96]); */ | |
1582 fft_asmb(4, &a[96], &a[112], &delta32[0], &delta32_3[0]); | |
1583 | |
1584 /* fft_128(&a[0]); */ | |
1585 fft_asmb(16, &a[0], &a[64], &delta128[0], &delta128_3[0]); | |
1586 } | |
1587 | |
1588 | |
1589 |