Mercurial > mplayer.hg
annotate mp3lib/dct36.c @ 31685:31b6397e3b28
Another try at fixing swscale on win64, as per r31153.
Don't change parameter passing, but instead use casts.
Shouldn't affect asm output on anything other than win64.
libswscale should work on win64 now.
The rest of ffmpeg still isn't win64 compatible due to the issue of xmm
clobbers, but swscale doesn't use any SSE.
Patch by Anton Mitrofanov <BugMaster AT narod DOT ru>.
author | darkshikari |
---|---|
date | Sun, 18 Jul 2010 21:39:57 +0000 |
parents | 0ad2da052b2e |
children |
rev | line source |
---|---|
15167
07e7a572bd84
Mark modified imported files as such to comply with (L)GPL §2a.
diego
parents:
1245
diff
changeset
|
1 /* |
18783 | 2 * Modified for use with MPlayer, for details see the changelog at |
3 * http://svn.mplayerhq.hu/mplayer/trunk/ | |
15167
07e7a572bd84
Mark modified imported files as such to comply with (L)GPL §2a.
diego
parents:
1245
diff
changeset
|
4 * $Id$ |
07e7a572bd84
Mark modified imported files as such to comply with (L)GPL §2a.
diego
parents:
1245
diff
changeset
|
5 */ |
07e7a572bd84
Mark modified imported files as such to comply with (L)GPL §2a.
diego
parents:
1245
diff
changeset
|
6 |
29263
0f1b5b68af32
whitespace cosmetics: Remove all trailing whitespace.
diego
parents:
18783
diff
changeset
|
7 /* |
1 | 8 // This is an optimized DCT from Jeff Tsay's maplay 1.2+ package. |
9 // Saved one multiplication by doing the 'twiddle factor' stuff | |
10 // together with the window mul. (MH) | |
11 // | |
12 // This uses Byeong Gi Lee's Fast Cosine Transform algorithm, but the | |
13 // 9 point IDCT needs to be reduced further. Unfortunately, I don't | |
14 // know how to do that, because 9 is not an even number. - Jeff. | |
15 // | |
16 ////////////////////////////////////////////////////////////////// | |
17 // | |
18 // 9 Point Inverse Discrete Cosine Transform | |
19 // | |
20 // This piece of code is Copyright 1997 Mikko Tommila and is freely usable | |
21 // by anybody. The algorithm itself is of course in the public domain. | |
22 // | |
23 // Again derived heuristically from the 9-point WFTA. | |
24 // | |
25 // The algorithm is optimized (?) for speed, not for small rounding errors or | |
26 // good readability. | |
27 // | |
28 // 36 additions, 11 multiplications | |
29 // | |
30 // Again this is very likely sub-optimal. | |
31 // | |
32 // The code is optimized to use a minimum number of temporary variables, | |
33 // so it should compile quite well even on 8-register Intel x86 processors. | |
34 // This makes the code quite obfuscated and very difficult to understand. | |
35 // | |
36 // References: | |
37 // [1] S. Winograd: "On Computing the Discrete Fourier Transform", | |
38 // Mathematics of Computation, Volume 32, Number 141, January 1978, | |
39 // Pages 175-199 | |
40 */ | |
41 | |
42 /*------------------------------------------------------------------*/ | |
43 /* */ | |
44 /* Function: Calculation of the inverse MDCT */ | |
45 /* */ | |
46 /*------------------------------------------------------------------*/ | |
47 | |
/*
 * dct36: transforms the 18 values in inbuf and stores 18 values to o2 and
 * 18 values to tsbuf.
 *
 *   inbuf  - 18 input values (indices 0..17); overwritten in place by the
 *            running sums at the top of the function.
 *   o1     - read at indices 0..17 and added into the values stored to
 *            tsbuf (presumably the overlap saved by an earlier call -
 *            TODO confirm against the caller).
 *   o2     - written at indices 0..17 (presumably the overlap for a later
 *            call - TODO confirm against the caller).
 *   wintab - window coefficients; indices 0..35 are read.
 *   tsbuf  - written at tsbuf[SBLIMIT*k] for k = 0..17.
 *
 * NEW_DCT9 selects between two implementations at compile time; both store
 * to the same out2[] and ts[] locations.
 */
48 static void dct36(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf) | |
49 { | |
50 #ifdef NEW_DCT9 | |
51 real tmp[18]; | |
52 #endif | |
53 | |
54 { | |
55 register real *in = inbuf; | |
56 | |
/* In-place running sum: in[k] += in[k-1], walked from k=17 down to k=1 so
   that every addend is still the unmodified input value. */
57 in[17]+=in[16]; in[16]+=in[15]; in[15]+=in[14]; | |
58 in[14]+=in[13]; in[13]+=in[12]; in[12]+=in[11]; | |
59 in[11]+=in[10]; in[10]+=in[9]; in[9] +=in[8]; | |
60 in[8] +=in[7]; in[7] +=in[6]; in[6] +=in[5]; | |
61 in[5] +=in[4]; in[4] +=in[3]; in[3] +=in[2]; | |
62 in[2] +=in[1]; in[1] +=in[0]; | |
63 | |
/* Second pass over the odd indices only: in[k] += in[k-2] for
   k = 17, 15, ..., 3, again top-down so each addend is the first-pass value. */
64 in[17]+=in[15]; in[15]+=in[13]; in[13]+=in[11]; in[11]+=in[9]; | |
65 in[9] +=in[7]; in[7] +=in[5]; in[5] +=in[3]; in[3] +=in[1]; | |
66 | |
67 | |
68 #ifdef NEW_DCT9 | |
/* Even-indexed inputs -> tmp[0..8]. */
69 { | |
70 real t0, t1, t2, t3, t4, t5, t6, t7; | |
71 | |
72 t1 = COS6_2 * in[12]; | |
73 t2 = COS6_2 * (in[8] + in[16] - in[4]); | |
74 | |
75 t3 = in[0] + t1; | |
76 t4 = in[0] - t1 - t1; | |
77 t5 = t4 - t2; | |
78 | |
79 t0 = cos9[0] * (in[4] + in[8]); | |
80 t1 = cos9[1] * (in[8] - in[16]); | |
81 | |
82 tmp[4] = t4 + t2 + t2; | |
83 t2 = cos9[2] * (in[4] + in[16]); | |
84 | |
85 t6 = t3 - t0 - t2; | |
86 t0 += t3 + t1; | |
87 t3 += t2 - t1; | |
88 | |
89 t2 = cos18[0] * (in[2] + in[10]); | |
90 t4 = cos18[1] * (in[10] - in[14]); | |
91 t7 = COS6_1 * in[6]; | |
92 | |
93 t1 = t2 + t4 + t7; | |
94 tmp[0] = t0 + t1; | |
95 tmp[8] = t0 - t1; | |
96 t1 = cos18[2] * (in[2] + in[14]); | |
97 t2 += t1 - t7; | |
98 | |
99 tmp[3] = t3 + t2; | |
100 t0 = COS6_1 * (in[10] + in[14] - in[2]); | |
101 tmp[5] = t3 - t2; | |
102 | |
103 t4 -= t1 + t7; | |
104 | |
105 tmp[1] = t5 - t0; | |
106 tmp[7] = t5 + t0; | |
107 tmp[2] = t6 + t4; | |
108 tmp[6] = t6 - t4; | |
109 } | |
110 | |
/* Odd-indexed inputs -> tmp[9..17]; same operation sequence as the block
   above, but each result tmp[k] is additionally scaled by tfcos36[17-k]. */
111 { | |
112 real t0, t1, t2, t3, t4, t5, t6, t7; | |
113 | |
114 t1 = COS6_2 * in[13]; | |
115 t2 = COS6_2 * (in[9] + in[17] - in[5]); | |
116 | |
117 t3 = in[1] + t1; | |
118 t4 = in[1] - t1 - t1; | |
119 t5 = t4 - t2; | |
120 | |
121 t0 = cos9[0] * (in[5] + in[9]); | |
122 t1 = cos9[1] * (in[9] - in[17]); | |
123 | |
124 tmp[13] = (t4 + t2 + t2) * tfcos36[17-13]; | |
125 t2 = cos9[2] * (in[5] + in[17]); | |
126 | |
127 t6 = t3 - t0 - t2; | |
128 t0 += t3 + t1; | |
129 t3 += t2 - t1; | |
130 | |
131 t2 = cos18[0] * (in[3] + in[11]); | |
132 t4 = cos18[1] * (in[11] - in[15]); | |
133 t7 = COS6_1 * in[7]; | |
134 | |
135 t1 = t2 + t4 + t7; | |
136 tmp[17] = (t0 + t1) * tfcos36[17-17]; | |
137 tmp[9] = (t0 - t1) * tfcos36[17-9]; | |
138 t1 = cos18[2] * (in[3] + in[15]); | |
139 t2 += t1 - t7; | |
140 | |
141 tmp[14] = (t3 + t2) * tfcos36[17-14]; | |
142 t0 = COS6_1 * (in[11] + in[15] - in[3]); | |
143 tmp[12] = (t3 - t2) * tfcos36[17-12]; | |
144 | |
145 t4 -= t1 + t7; | |
146 | |
147 tmp[16] = (t5 - t0) * tfcos36[17-16]; | |
148 tmp[10] = (t5 + t0) * tfcos36[17-10]; | |
149 tmp[15] = (t6 + t4) * tfcos36[17-15]; | |
150 tmp[11] = (t6 - t4) * tfcos36[17-11]; | |
151 } | |
152 | |
/* MACRO(v): pairs tmp[v] with tmp[17-v]. Their sum, multiplied by w[27+v]
   and w[26-v], is stored to out2[9+v] and out2[8-v]; their difference,
   multiplied by w[8-v] and w[9+v], is added to out1[8-v] / out1[9+v] and
   stored to ts[] at stride SBLIMIT. */
153 #define MACRO(v) { \ | |
154 real tmpval; \ | |
155 real sum0 = tmp[(v)]; \ | |
156 real sum1 = tmp[17-(v)]; \ | |
157 out2[9+(v)] = (tmpval = sum0 + sum1) * w[27+(v)]; \ | |
158 out2[8-(v)] = tmpval * w[26-(v)]; \ | |
159 sum0 -= sum1; \ | |
160 ts[SBLIMIT*(8-(v))] = out1[8-(v)] + sum0 * w[8-(v)]; \ | |
161 ts[SBLIMIT*(9+(v))] = out1[9+(v)] + sum0 * w[9+(v)]; } | |
162 | |
163 { | |
164 register real *out2 = o2; | |
165 register real *w = wintab; | |
166 register real *out1 = o1; | |
167 register real *ts = tsbuf; | |
168 | |
169 MACRO(0); | |
170 MACRO(1); | |
171 MACRO(2); | |
172 MACRO(3); | |
173 MACRO(4); | |
174 MACRO(5); | |
175 MACRO(6); | |
176 MACRO(7); | |
177 MACRO(8); | |
178 } | |
179 | |
180 #else | |
181 | |
182 { | |
183 | |
/* MACRO0(v): expects sum0 and sum1 to be in scope at the expansion site.
   Stores (sum0 + sum1) times w[27+v] / w[26-v] to out2[9+v] / out2[8-v],
   then overwrites sum0 with sum0 - sum1 and stores out1[] + sum0 * w[] to
   ts[SBLIMIT*(8-v)] and ts[SBLIMIT*(9+v)]. The closing brace after the
   out2 stores only ends the scope of the local tmp; the statements after
   it are still part of the macro. */
184 #define MACRO0(v) { \ | |
185 real tmp; \ | |
186 out2[9+(v)] = (tmp = sum0 + sum1) * w[27+(v)]; \ | |
187 out2[8-(v)] = tmp * w[26-(v)]; } \ | |
188 sum0 -= sum1; \ | |
189 ts[SBLIMIT*(8-(v))] = out1[8-(v)] + sum0 * w[8-(v)]; \ | |
29263
0f1b5b68af32
whitespace cosmetics: Remove all trailing whitespace.
diego
parents:
18783
diff
changeset
|
190 ts[SBLIMIT*(9+(v))] = out1[9+(v)] + sum0 * w[9+(v)]; |
/* MACRO1(v) / MACRO2(v): build sum0 and sum1 from the tmp1a/tmp2a/tmp1b/tmp2b
   of the enclosing block (tmp1 + tmp2 for MACRO1, tmp2 - tmp1 for MACRO2),
   scale the "b" combination by tfcos36[v], then emit through MACRO0(v). */
1 | 191 #define MACRO1(v) { \ |
30990 | 192 real sum0, sum1; \ |
1 | 193 sum0 = tmp1a + tmp2a; \ |
30990 | 194 sum1 = (tmp1b + tmp2b) * tfcos36[(v)]; \ |
195 MACRO0(v); } | |
1 | 196 #define MACRO2(v) { \ |
30990 | 197 real sum0, sum1; \ |
1 | 198 sum0 = tmp2a - tmp1a; \ |
199 sum1 = (tmp2b - tmp1b) * tfcos36[(v)]; \ | |
30990 | 200 MACRO0(v); } |
1 | 201 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
202 register const real *c = COS9; |
1 | 203 register real *out2 = o2; |
30990 | 204 register real *w = wintab; |
205 register real *out1 = o1; | |
206 register real *ts = tsbuf; | |
1 | 207 |
208 real ta33,ta66,tb33,tb66; | |
209 | |
/* Products reused by the first, third and fourth blocks below:
   "a" terms come from even input indices, "b" terms from odd ones. */
210 ta33 = in[2*3+0] * c[3]; | |
211 ta66 = in[2*6+0] * c[6]; | |
212 tb33 = in[2*3+1] * c[3]; | |
213 tb66 = in[2*6+1] * c[6]; | |
214 | |
29263
0f1b5b68af32
whitespace cosmetics: Remove all trailing whitespace.
diego
parents:
18783
diff
changeset
|
/* Each block below computes one tmp1/tmp2 set and emits two output pairs,
   v and 8-v, through MACRO1 and MACRO2. */
215 { |
1 | 216 real tmp1a,tmp2a,tmp1b,tmp2b; |
217 tmp1a = in[2*1+0] * c[1] + ta33 + in[2*5+0] * c[5] + in[2*7+0] * c[7]; | |
218 tmp1b = in[2*1+1] * c[1] + tb33 + in[2*5+1] * c[5] + in[2*7+1] * c[7]; | |
219 tmp2a = in[2*0+0] + in[2*2+0] * c[2] + in[2*4+0] * c[4] + ta66 + in[2*8+0] * c[8]; | |
220 tmp2b = in[2*0+1] + in[2*2+1] * c[2] + in[2*4+1] * c[4] + tb66 + in[2*8+1] * c[8]; | |
221 | |
222 MACRO1(0); | |
223 MACRO2(8); | |
224 } | |
225 | |
226 { | |
227 real tmp1a,tmp2a,tmp1b,tmp2b; | |
228 tmp1a = ( in[2*1+0] - in[2*5+0] - in[2*7+0] ) * c[3]; | |
229 tmp1b = ( in[2*1+1] - in[2*5+1] - in[2*7+1] ) * c[3]; | |
230 tmp2a = ( in[2*2+0] - in[2*4+0] - in[2*8+0] ) * c[6] - in[2*6+0] + in[2*0+0]; | |
231 tmp2b = ( in[2*2+1] - in[2*4+1] - in[2*8+1] ) * c[6] - in[2*6+1] + in[2*0+1]; | |
232 | |
233 MACRO1(1); | |
234 MACRO2(7); | |
235 } | |
236 | |
237 { | |
238 real tmp1a,tmp2a,tmp1b,tmp2b; | |
239 tmp1a = in[2*1+0] * c[5] - ta33 - in[2*5+0] * c[7] + in[2*7+0] * c[1]; | |
240 tmp1b = in[2*1+1] * c[5] - tb33 - in[2*5+1] * c[7] + in[2*7+1] * c[1]; | |
241 tmp2a = in[2*0+0] - in[2*2+0] * c[8] - in[2*4+0] * c[2] + ta66 + in[2*8+0] * c[4]; | |
242 tmp2b = in[2*0+1] - in[2*2+1] * c[8] - in[2*4+1] * c[2] + tb66 + in[2*8+1] * c[4]; | |
243 | |
244 MACRO1(2); | |
245 MACRO2(6); | |
246 } | |
247 | |
248 { | |
249 real tmp1a,tmp2a,tmp1b,tmp2b; | |
250 tmp1a = in[2*1+0] * c[7] - ta33 + in[2*5+0] * c[1] - in[2*7+0] * c[5]; | |
251 tmp1b = in[2*1+1] * c[7] - tb33 + in[2*5+1] * c[1] - in[2*7+1] * c[5]; | |
252 tmp2a = in[2*0+0] - in[2*2+0] * c[4] + in[2*4+0] * c[8] + ta66 - in[2*8+0] * c[2]; | |
253 tmp2b = in[2*0+1] - in[2*2+1] * c[4] + in[2*4+1] * c[8] + tb66 - in[2*8+1] * c[2]; | |
254 | |
255 MACRO1(3); | |
256 MACRO2(5); | |
257 } | |
258 | |
/* Middle pair v = 4: sum0 and sum1 are formed directly from the inputs and
   passed straight to MACRO0, without the tmp1/tmp2 split used above. */
30990 | 259 { |
260 real sum0,sum1; | |
261 sum0 = in[2*0+0] - in[2*2+0] + in[2*4+0] - in[2*6+0] + in[2*8+0]; | |
262 sum1 = (in[2*0+1] - in[2*2+1] + in[2*4+1] - in[2*6+1] + in[2*8+1] ) * tfcos36[4]; | |
263 MACRO0(4); | |
264 } | |
1 | 265 } |
266 #endif | |
267 | |
268 } | |
269 } |