Mercurial > mplayer.hg
annotate mp3lib/dct36.c @ 4689:61f4b8fd380e
Fixing "quake" by direct waiting of vsync.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
(I don't why - but SMART_SWITCH is always disabled on my card)
Benchmarks:
[SRC] VIDEO: [DIV3] 624x356 24bpp 24.00 fps 497.3 kbps (60.7 kbyte/s)
[DEST] 1024x768@32 70fps (-xvidix -fs -zoom)
-vc ffdivx -double:
BENCHMARKs: V: 3.838s VO: 7.305s A: 0.555s Sys: 18.264s = 29.962s
BENCHMARK%: V: 12.8110% VO: 24.3808% A: 1.8518% Sys: 60.9564% = 100.0000%
total video time: 11.143s
-vc ffdivx -nodouble:
BENCHMARKs: V: 3.846s VO: 1.668s A: 0.539s Sys: 23.869s = 29.922s
BENCHMARK%: V: 12.8525% VO: 5.5744% A: 1.8015% Sys: 79.7716% = 100.0000%
total video time: 5.514s
-vc divxds -double (direct rendering)
BENCHMARKs: V: 8.275s VO: 5.750s A: 0.532s Sys: 15.414s = 29.971s
BENCHMARK%: V: 27.6115% VO: 19.1850% A: 1.7737% Sys: 51.4298% = 100.0000%
total video time: 14.070s
-vc divxds -nodouble (direct rendering)
BENCHMARKs: V: 7.353s VO: 0.002s A: 0.521s Sys: 22.083s = 29.958s
BENCHMARK%: V: 24.5433% VO: 0.0052% A: 1.7382% Sys: 73.7133% = 100.0000%
total video time: 7.355s
Unfortunately we have dramatic lost of performance (100%) :(
author | nick |
---|---|
date | Wed, 13 Feb 2002 08:24:13 +0000 |
parents | 03b7e2955a20 |
children | 07e7a572bd84 |
rev | line source |
---|---|
1 | 1 /* |
2 // This is an optimized DCT from Jeff Tsay's maplay 1.2+ package. | |
3 // Saved one multiplication by doing the 'twiddle factor' stuff | |
4 // together with the window mul. (MH) | |
5 // | |
6 // This uses Byeong Gi Lee's Fast Cosine Transform algorithm, but the | |
7 // 9 point IDCT needs to be reduced further. Unfortunately, I don't | |
8 // know how to do that, because 9 is not an even number. - Jeff. | |
9 // | |
10 ////////////////////////////////////////////////////////////////// | |
11 // | |
12 // 9 Point Inverse Discrete Cosine Transform | |
13 // | |
14 // This piece of code is Copyright 1997 Mikko Tommila and is freely usable | |
15 // by anybody. The algorithm itself is of course in the public domain. | |
16 // | |
17 // Again derived heuristically from the 9-point WFTA. | |
18 // | |
19 // The algorithm is optimized (?) for speed, not for small rounding errors or | |
20 // good readability. | |
21 // | |
22 // 36 additions, 11 multiplications | |
23 // | |
24 // Again this is very likely sub-optimal. | |
25 // | |
26 // The code is optimized to use a minimum number of temporary variables, | |
27 // so it should compile quite well even on 8-register Intel x86 processors. | |
28 // This makes the code quite obfuscated and very difficult to understand. | |
29 // | |
30 // References: | |
31 // [1] S. Winograd: "On Computing the Discrete Fourier Transform", | |
32 // Mathematics of Computation, Volume 32, Number 141, January 1978, | |
33 // Pages 175-199 | |
34 */ | |
35 | |
36 /*------------------------------------------------------------------*/ | |
37 /* */ | |
38 /* Function: Calculation of the inverse MDCT */ | |
39 /* */ | |
40 /*------------------------------------------------------------------*/ | |
41 | |
42 static void dct36(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf) | |
43 { | |
44 #ifdef NEW_DCT9 | |
45 real tmp[18]; | |
46 #endif | |
47 | |
48 { | |
49 register real *in = inbuf; | |
50 | |
51 in[17]+=in[16]; in[16]+=in[15]; in[15]+=in[14]; | |
52 in[14]+=in[13]; in[13]+=in[12]; in[12]+=in[11]; | |
53 in[11]+=in[10]; in[10]+=in[9]; in[9] +=in[8]; | |
54 in[8] +=in[7]; in[7] +=in[6]; in[6] +=in[5]; | |
55 in[5] +=in[4]; in[4] +=in[3]; in[3] +=in[2]; | |
56 in[2] +=in[1]; in[1] +=in[0]; | |
57 | |
58 in[17]+=in[15]; in[15]+=in[13]; in[13]+=in[11]; in[11]+=in[9]; | |
59 in[9] +=in[7]; in[7] +=in[5]; in[5] +=in[3]; in[3] +=in[1]; | |
60 | |
61 | |
62 #ifdef NEW_DCT9 | |
63 { | |
64 real t0, t1, t2, t3, t4, t5, t6, t7; | |
65 | |
66 t1 = COS6_2 * in[12]; | |
67 t2 = COS6_2 * (in[8] + in[16] - in[4]); | |
68 | |
69 t3 = in[0] + t1; | |
70 t4 = in[0] - t1 - t1; | |
71 t5 = t4 - t2; | |
72 | |
73 t0 = cos9[0] * (in[4] + in[8]); | |
74 t1 = cos9[1] * (in[8] - in[16]); | |
75 | |
76 tmp[4] = t4 + t2 + t2; | |
77 t2 = cos9[2] * (in[4] + in[16]); | |
78 | |
79 t6 = t3 - t0 - t2; | |
80 t0 += t3 + t1; | |
81 t3 += t2 - t1; | |
82 | |
83 t2 = cos18[0] * (in[2] + in[10]); | |
84 t4 = cos18[1] * (in[10] - in[14]); | |
85 t7 = COS6_1 * in[6]; | |
86 | |
87 t1 = t2 + t4 + t7; | |
88 tmp[0] = t0 + t1; | |
89 tmp[8] = t0 - t1; | |
90 t1 = cos18[2] * (in[2] + in[14]); | |
91 t2 += t1 - t7; | |
92 | |
93 tmp[3] = t3 + t2; | |
94 t0 = COS6_1 * (in[10] + in[14] - in[2]); | |
95 tmp[5] = t3 - t2; | |
96 | |
97 t4 -= t1 + t7; | |
98 | |
99 tmp[1] = t5 - t0; | |
100 tmp[7] = t5 + t0; | |
101 tmp[2] = t6 + t4; | |
102 tmp[6] = t6 - t4; | |
103 } | |
104 | |
105 { | |
106 real t0, t1, t2, t3, t4, t5, t6, t7; | |
107 | |
108 t1 = COS6_2 * in[13]; | |
109 t2 = COS6_2 * (in[9] + in[17] - in[5]); | |
110 | |
111 t3 = in[1] + t1; | |
112 t4 = in[1] - t1 - t1; | |
113 t5 = t4 - t2; | |
114 | |
115 t0 = cos9[0] * (in[5] + in[9]); | |
116 t1 = cos9[1] * (in[9] - in[17]); | |
117 | |
118 tmp[13] = (t4 + t2 + t2) * tfcos36[17-13]; | |
119 t2 = cos9[2] * (in[5] + in[17]); | |
120 | |
121 t6 = t3 - t0 - t2; | |
122 t0 += t3 + t1; | |
123 t3 += t2 - t1; | |
124 | |
125 t2 = cos18[0] * (in[3] + in[11]); | |
126 t4 = cos18[1] * (in[11] - in[15]); | |
127 t7 = COS6_1 * in[7]; | |
128 | |
129 t1 = t2 + t4 + t7; | |
130 tmp[17] = (t0 + t1) * tfcos36[17-17]; | |
131 tmp[9] = (t0 - t1) * tfcos36[17-9]; | |
132 t1 = cos18[2] * (in[3] + in[15]); | |
133 t2 += t1 - t7; | |
134 | |
135 tmp[14] = (t3 + t2) * tfcos36[17-14]; | |
136 t0 = COS6_1 * (in[11] + in[15] - in[3]); | |
137 tmp[12] = (t3 - t2) * tfcos36[17-12]; | |
138 | |
139 t4 -= t1 + t7; | |
140 | |
141 tmp[16] = (t5 - t0) * tfcos36[17-16]; | |
142 tmp[10] = (t5 + t0) * tfcos36[17-10]; | |
143 tmp[15] = (t6 + t4) * tfcos36[17-15]; | |
144 tmp[11] = (t6 - t4) * tfcos36[17-11]; | |
145 } | |
146 | |
147 #define MACRO(v) { \ | |
148 real tmpval; \ | |
149 real sum0 = tmp[(v)]; \ | |
150 real sum1 = tmp[17-(v)]; \ | |
151 out2[9+(v)] = (tmpval = sum0 + sum1) * w[27+(v)]; \ | |
152 out2[8-(v)] = tmpval * w[26-(v)]; \ | |
153 sum0 -= sum1; \ | |
154 ts[SBLIMIT*(8-(v))] = out1[8-(v)] + sum0 * w[8-(v)]; \ | |
155 ts[SBLIMIT*(9+(v))] = out1[9+(v)] + sum0 * w[9+(v)]; } | |
156 | |
157 { | |
158 register real *out2 = o2; | |
159 register real *w = wintab; | |
160 register real *out1 = o1; | |
161 register real *ts = tsbuf; | |
162 | |
163 MACRO(0); | |
164 MACRO(1); | |
165 MACRO(2); | |
166 MACRO(3); | |
167 MACRO(4); | |
168 MACRO(5); | |
169 MACRO(6); | |
170 MACRO(7); | |
171 MACRO(8); | |
172 } | |
173 | |
174 #else | |
175 | |
176 { | |
177 | |
178 #define MACRO0(v) { \ | |
179 real tmp; \ | |
180 out2[9+(v)] = (tmp = sum0 + sum1) * w[27+(v)]; \ | |
181 out2[8-(v)] = tmp * w[26-(v)]; } \ | |
182 sum0 -= sum1; \ | |
183 ts[SBLIMIT*(8-(v))] = out1[8-(v)] + sum0 * w[8-(v)]; \ | |
184 ts[SBLIMIT*(9+(v))] = out1[9+(v)] + sum0 * w[9+(v)]; | |
185 #define MACRO1(v) { \ | |
186 real sum0,sum1; \ | |
187 sum0 = tmp1a + tmp2a; \ | |
188 sum1 = (tmp1b + tmp2b) * tfcos36[(v)]; \ | |
189 MACRO0(v); } | |
190 #define MACRO2(v) { \ | |
191 real sum0,sum1; \ | |
192 sum0 = tmp2a - tmp1a; \ | |
193 sum1 = (tmp2b - tmp1b) * tfcos36[(v)]; \ | |
194 MACRO0(v); } | |
195 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
196 register const real *c = COS9; |
1 | 197 register real *out2 = o2; |
198 register real *w = wintab; | |
199 register real *out1 = o1; | |
200 register real *ts = tsbuf; | |
201 | |
202 real ta33,ta66,tb33,tb66; | |
203 | |
204 ta33 = in[2*3+0] * c[3]; | |
205 ta66 = in[2*6+0] * c[6]; | |
206 tb33 = in[2*3+1] * c[3]; | |
207 tb66 = in[2*6+1] * c[6]; | |
208 | |
209 { | |
210 real tmp1a,tmp2a,tmp1b,tmp2b; | |
211 tmp1a = in[2*1+0] * c[1] + ta33 + in[2*5+0] * c[5] + in[2*7+0] * c[7]; | |
212 tmp1b = in[2*1+1] * c[1] + tb33 + in[2*5+1] * c[5] + in[2*7+1] * c[7]; | |
213 tmp2a = in[2*0+0] + in[2*2+0] * c[2] + in[2*4+0] * c[4] + ta66 + in[2*8+0] * c[8]; | |
214 tmp2b = in[2*0+1] + in[2*2+1] * c[2] + in[2*4+1] * c[4] + tb66 + in[2*8+1] * c[8]; | |
215 | |
216 MACRO1(0); | |
217 MACRO2(8); | |
218 } | |
219 | |
220 { | |
221 real tmp1a,tmp2a,tmp1b,tmp2b; | |
222 tmp1a = ( in[2*1+0] - in[2*5+0] - in[2*7+0] ) * c[3]; | |
223 tmp1b = ( in[2*1+1] - in[2*5+1] - in[2*7+1] ) * c[3]; | |
224 tmp2a = ( in[2*2+0] - in[2*4+0] - in[2*8+0] ) * c[6] - in[2*6+0] + in[2*0+0]; | |
225 tmp2b = ( in[2*2+1] - in[2*4+1] - in[2*8+1] ) * c[6] - in[2*6+1] + in[2*0+1]; | |
226 | |
227 MACRO1(1); | |
228 MACRO2(7); | |
229 } | |
230 | |
231 { | |
232 real tmp1a,tmp2a,tmp1b,tmp2b; | |
233 tmp1a = in[2*1+0] * c[5] - ta33 - in[2*5+0] * c[7] + in[2*7+0] * c[1]; | |
234 tmp1b = in[2*1+1] * c[5] - tb33 - in[2*5+1] * c[7] + in[2*7+1] * c[1]; | |
235 tmp2a = in[2*0+0] - in[2*2+0] * c[8] - in[2*4+0] * c[2] + ta66 + in[2*8+0] * c[4]; | |
236 tmp2b = in[2*0+1] - in[2*2+1] * c[8] - in[2*4+1] * c[2] + tb66 + in[2*8+1] * c[4]; | |
237 | |
238 MACRO1(2); | |
239 MACRO2(6); | |
240 } | |
241 | |
242 { | |
243 real tmp1a,tmp2a,tmp1b,tmp2b; | |
244 tmp1a = in[2*1+0] * c[7] - ta33 + in[2*5+0] * c[1] - in[2*7+0] * c[5]; | |
245 tmp1b = in[2*1+1] * c[7] - tb33 + in[2*5+1] * c[1] - in[2*7+1] * c[5]; | |
246 tmp2a = in[2*0+0] - in[2*2+0] * c[4] + in[2*4+0] * c[8] + ta66 - in[2*8+0] * c[2]; | |
247 tmp2b = in[2*0+1] - in[2*2+1] * c[4] + in[2*4+1] * c[8] + tb66 - in[2*8+1] * c[2]; | |
248 | |
249 MACRO1(3); | |
250 MACRO2(5); | |
251 } | |
252 | |
253 { | |
254 real sum0,sum1; | |
255 sum0 = in[2*0+0] - in[2*2+0] + in[2*4+0] - in[2*6+0] + in[2*8+0]; | |
256 sum1 = (in[2*0+1] - in[2*2+1] + in[2*4+1] - in[2*6+1] + in[2*8+1] ) * tfcos36[4]; | |
257 MACRO0(4); | |
258 } | |
259 } | |
260 #endif | |
261 | |
262 } | |
263 } | |
264 |