Mercurial > mplayer.hg
annotate mp3lib/dct36.c @ 2316:bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
added half uv interpolation support
added prefetch
BGR15 support in MMX (untested) (so BGR15,16,24,32 are supported)
special unscaled height version (not much faster but it doesnt interpolate uv vertically)
author | michael |
---|---|
date | Sat, 20 Oct 2001 21:12:09 +0000 |
parents | 03b7e2955a20 |
children | 07e7a572bd84 |
rev | line source |
---|---|
1 | 1 /* |
2 // This is an optimized DCT from Jeff Tsay's maplay 1.2+ package. | |
3 // Saved one multiplication by doing the 'twiddle factor' stuff | |
4 // together with the window mul. (MH) | |
5 // | |
6 // This uses Byeong Gi Lee's Fast Cosine Transform algorithm, but the | |
7 // 9 point IDCT needs to be reduced further. Unfortunately, I don't | |
8 // know how to do that, because 9 is not an even number. - Jeff. | |
9 // | |
10 ////////////////////////////////////////////////////////////////// | |
11 // | |
12 // 9 Point Inverse Discrete Cosine Transform | |
13 // | |
14 // This piece of code is Copyright 1997 Mikko Tommila and is freely usable | |
15 // by anybody. The algorithm itself is of course in the public domain. | |
16 // | |
17 // Again derived heuristically from the 9-point WFTA. | |
18 // | |
19 // The algorithm is optimized (?) for speed, not for small rounding errors or | |
20 // good readability. | |
21 // | |
22 // 36 additions, 11 multiplications | |
23 // | |
24 // Again this is very likely sub-optimal. | |
25 // | |
26 // The code is optimized to use a minimum number of temporary variables, | |
27 // so it should compile quite well even on 8-register Intel x86 processors. | |
28 // This makes the code quite obfuscated and very difficult to understand. | |
29 // | |
30 // References: | |
31 // [1] S. Winograd: "On Computing the Discrete Fourier Transform", | |
32 // Mathematics of Computation, Volume 32, Number 141, January 1978, | |
33 // Pages 175-199 | |
34 */ | |
35 | |
36 /*------------------------------------------------------------------*/ | |
37 /* */ | |
38 /* Function: Calculation of the inverse MDCT */ | |
39 /* */ | |
40 /*------------------------------------------------------------------*/ | |
41 | |
42 static void dct36(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf) | |
43 { | |
44 #ifdef NEW_DCT9 | |
45 real tmp[18]; | |
46 #endif | |
47 | |
48 { | |
49 register real *in = inbuf; | |
50 | |
51 in[17]+=in[16]; in[16]+=in[15]; in[15]+=in[14]; | |
52 in[14]+=in[13]; in[13]+=in[12]; in[12]+=in[11]; | |
53 in[11]+=in[10]; in[10]+=in[9]; in[9] +=in[8]; | |
54 in[8] +=in[7]; in[7] +=in[6]; in[6] +=in[5]; | |
55 in[5] +=in[4]; in[4] +=in[3]; in[3] +=in[2]; | |
56 in[2] +=in[1]; in[1] +=in[0]; | |
57 | |
58 in[17]+=in[15]; in[15]+=in[13]; in[13]+=in[11]; in[11]+=in[9]; | |
59 in[9] +=in[7]; in[7] +=in[5]; in[5] +=in[3]; in[3] +=in[1]; | |
60 | |
61 | |
62 #ifdef NEW_DCT9 | |
63 { | |
64 real t0, t1, t2, t3, t4, t5, t6, t7; | |
65 | |
66 t1 = COS6_2 * in[12]; | |
67 t2 = COS6_2 * (in[8] + in[16] - in[4]); | |
68 | |
69 t3 = in[0] + t1; | |
70 t4 = in[0] - t1 - t1; | |
71 t5 = t4 - t2; | |
72 | |
73 t0 = cos9[0] * (in[4] + in[8]); | |
74 t1 = cos9[1] * (in[8] - in[16]); | |
75 | |
76 tmp[4] = t4 + t2 + t2; | |
77 t2 = cos9[2] * (in[4] + in[16]); | |
78 | |
79 t6 = t3 - t0 - t2; | |
80 t0 += t3 + t1; | |
81 t3 += t2 - t1; | |
82 | |
83 t2 = cos18[0] * (in[2] + in[10]); | |
84 t4 = cos18[1] * (in[10] - in[14]); | |
85 t7 = COS6_1 * in[6]; | |
86 | |
87 t1 = t2 + t4 + t7; | |
88 tmp[0] = t0 + t1; | |
89 tmp[8] = t0 - t1; | |
90 t1 = cos18[2] * (in[2] + in[14]); | |
91 t2 += t1 - t7; | |
92 | |
93 tmp[3] = t3 + t2; | |
94 t0 = COS6_1 * (in[10] + in[14] - in[2]); | |
95 tmp[5] = t3 - t2; | |
96 | |
97 t4 -= t1 + t7; | |
98 | |
99 tmp[1] = t5 - t0; | |
100 tmp[7] = t5 + t0; | |
101 tmp[2] = t6 + t4; | |
102 tmp[6] = t6 - t4; | |
103 } | |
104 | |
105 { | |
106 real t0, t1, t2, t3, t4, t5, t6, t7; | |
107 | |
108 t1 = COS6_2 * in[13]; | |
109 t2 = COS6_2 * (in[9] + in[17] - in[5]); | |
110 | |
111 t3 = in[1] + t1; | |
112 t4 = in[1] - t1 - t1; | |
113 t5 = t4 - t2; | |
114 | |
115 t0 = cos9[0] * (in[5] + in[9]); | |
116 t1 = cos9[1] * (in[9] - in[17]); | |
117 | |
118 tmp[13] = (t4 + t2 + t2) * tfcos36[17-13]; | |
119 t2 = cos9[2] * (in[5] + in[17]); | |
120 | |
121 t6 = t3 - t0 - t2; | |
122 t0 += t3 + t1; | |
123 t3 += t2 - t1; | |
124 | |
125 t2 = cos18[0] * (in[3] + in[11]); | |
126 t4 = cos18[1] * (in[11] - in[15]); | |
127 t7 = COS6_1 * in[7]; | |
128 | |
129 t1 = t2 + t4 + t7; | |
130 tmp[17] = (t0 + t1) * tfcos36[17-17]; | |
131 tmp[9] = (t0 - t1) * tfcos36[17-9]; | |
132 t1 = cos18[2] * (in[3] + in[15]); | |
133 t2 += t1 - t7; | |
134 | |
135 tmp[14] = (t3 + t2) * tfcos36[17-14]; | |
136 t0 = COS6_1 * (in[11] + in[15] - in[3]); | |
137 tmp[12] = (t3 - t2) * tfcos36[17-12]; | |
138 | |
139 t4 -= t1 + t7; | |
140 | |
141 tmp[16] = (t5 - t0) * tfcos36[17-16]; | |
142 tmp[10] = (t5 + t0) * tfcos36[17-10]; | |
143 tmp[15] = (t6 + t4) * tfcos36[17-15]; | |
144 tmp[11] = (t6 - t4) * tfcos36[17-11]; | |
145 } | |
146 | |
147 #define MACRO(v) { \ | |
148 real tmpval; \ | |
149 real sum0 = tmp[(v)]; \ | |
150 real sum1 = tmp[17-(v)]; \ | |
151 out2[9+(v)] = (tmpval = sum0 + sum1) * w[27+(v)]; \ | |
152 out2[8-(v)] = tmpval * w[26-(v)]; \ | |
153 sum0 -= sum1; \ | |
154 ts[SBLIMIT*(8-(v))] = out1[8-(v)] + sum0 * w[8-(v)]; \ | |
155 ts[SBLIMIT*(9+(v))] = out1[9+(v)] + sum0 * w[9+(v)]; } | |
156 | |
157 { | |
158 register real *out2 = o2; | |
159 register real *w = wintab; | |
160 register real *out1 = o1; | |
161 register real *ts = tsbuf; | |
162 | |
163 MACRO(0); | |
164 MACRO(1); | |
165 MACRO(2); | |
166 MACRO(3); | |
167 MACRO(4); | |
168 MACRO(5); | |
169 MACRO(6); | |
170 MACRO(7); | |
171 MACRO(8); | |
172 } | |
173 | |
174 #else | |
175 | |
176 { | |
177 | |
178 #define MACRO0(v) { \ | |
179 real tmp; \ | |
180 out2[9+(v)] = (tmp = sum0 + sum1) * w[27+(v)]; \ | |
181 out2[8-(v)] = tmp * w[26-(v)]; } \ | |
182 sum0 -= sum1; \ | |
183 ts[SBLIMIT*(8-(v))] = out1[8-(v)] + sum0 * w[8-(v)]; \ | |
184 ts[SBLIMIT*(9+(v))] = out1[9+(v)] + sum0 * w[9+(v)]; | |
185 #define MACRO1(v) { \ | |
186 real sum0,sum1; \ | |
187 sum0 = tmp1a + tmp2a; \ | |
188 sum1 = (tmp1b + tmp2b) * tfcos36[(v)]; \ | |
189 MACRO0(v); } | |
190 #define MACRO2(v) { \ | |
191 real sum0,sum1; \ | |
192 sum0 = tmp2a - tmp1a; \ | |
193 sum1 = (tmp2b - tmp1b) * tfcos36[(v)]; \ | |
194 MACRO0(v); } | |
195 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
196 register const real *c = COS9; |
1 | 197 register real *out2 = o2; |
198 register real *w = wintab; | |
199 register real *out1 = o1; | |
200 register real *ts = tsbuf; | |
201 | |
202 real ta33,ta66,tb33,tb66; | |
203 | |
204 ta33 = in[2*3+0] * c[3]; | |
205 ta66 = in[2*6+0] * c[6]; | |
206 tb33 = in[2*3+1] * c[3]; | |
207 tb66 = in[2*6+1] * c[6]; | |
208 | |
209 { | |
210 real tmp1a,tmp2a,tmp1b,tmp2b; | |
211 tmp1a = in[2*1+0] * c[1] + ta33 + in[2*5+0] * c[5] + in[2*7+0] * c[7]; | |
212 tmp1b = in[2*1+1] * c[1] + tb33 + in[2*5+1] * c[5] + in[2*7+1] * c[7]; | |
213 tmp2a = in[2*0+0] + in[2*2+0] * c[2] + in[2*4+0] * c[4] + ta66 + in[2*8+0] * c[8]; | |
214 tmp2b = in[2*0+1] + in[2*2+1] * c[2] + in[2*4+1] * c[4] + tb66 + in[2*8+1] * c[8]; | |
215 | |
216 MACRO1(0); | |
217 MACRO2(8); | |
218 } | |
219 | |
220 { | |
221 real tmp1a,tmp2a,tmp1b,tmp2b; | |
222 tmp1a = ( in[2*1+0] - in[2*5+0] - in[2*7+0] ) * c[3]; | |
223 tmp1b = ( in[2*1+1] - in[2*5+1] - in[2*7+1] ) * c[3]; | |
224 tmp2a = ( in[2*2+0] - in[2*4+0] - in[2*8+0] ) * c[6] - in[2*6+0] + in[2*0+0]; | |
225 tmp2b = ( in[2*2+1] - in[2*4+1] - in[2*8+1] ) * c[6] - in[2*6+1] + in[2*0+1]; | |
226 | |
227 MACRO1(1); | |
228 MACRO2(7); | |
229 } | |
230 | |
231 { | |
232 real tmp1a,tmp2a,tmp1b,tmp2b; | |
233 tmp1a = in[2*1+0] * c[5] - ta33 - in[2*5+0] * c[7] + in[2*7+0] * c[1]; | |
234 tmp1b = in[2*1+1] * c[5] - tb33 - in[2*5+1] * c[7] + in[2*7+1] * c[1]; | |
235 tmp2a = in[2*0+0] - in[2*2+0] * c[8] - in[2*4+0] * c[2] + ta66 + in[2*8+0] * c[4]; | |
236 tmp2b = in[2*0+1] - in[2*2+1] * c[8] - in[2*4+1] * c[2] + tb66 + in[2*8+1] * c[4]; | |
237 | |
238 MACRO1(2); | |
239 MACRO2(6); | |
240 } | |
241 | |
242 { | |
243 real tmp1a,tmp2a,tmp1b,tmp2b; | |
244 tmp1a = in[2*1+0] * c[7] - ta33 + in[2*5+0] * c[1] - in[2*7+0] * c[5]; | |
245 tmp1b = in[2*1+1] * c[7] - tb33 + in[2*5+1] * c[1] - in[2*7+1] * c[5]; | |
246 tmp2a = in[2*0+0] - in[2*2+0] * c[4] + in[2*4+0] * c[8] + ta66 - in[2*8+0] * c[2]; | |
247 tmp2b = in[2*0+1] - in[2*2+1] * c[4] + in[2*4+1] * c[8] + tb66 - in[2*8+1] * c[2]; | |
248 | |
249 MACRO1(3); | |
250 MACRO2(5); | |
251 } | |
252 | |
253 { | |
254 real sum0,sum1; | |
255 sum0 = in[2*0+0] - in[2*2+0] + in[2*4+0] - in[2*6+0] + in[2*8+0]; | |
256 sum1 = (in[2*0+1] - in[2*2+1] + in[2*4+1] - in[2*6+1] + in[2*8+1] ) * tfcos36[4]; | |
257 MACRO0(4); | |
258 } | |
259 } | |
260 #endif | |
261 | |
262 } | |
263 } | |
264 |