Mercurial > libavcodec.hg
comparison alpha/simple_idct_alpha.c @ 744:2f7da29ede37 libavcodec
Move Alpha optimized IDCT to own file. Based on a patch by Måns
Rullgård <mru@users.sourceforge.net>.
I've left out the idctCol2 part, because W4 has recently been decreed
to be 16383, and also I doubt it will give a noticeable speedup.
author | mellum |
---|---|
date | Fri, 11 Oct 2002 23:01:16 +0000 |
parents | |
children | 3dbbdc2f8bd3 |
comparison
equal
deleted
inserted
replaced
743:4cf7173a004e | 744:2f7da29ede37 |
---|---|
1 /* | |
2 * Simple IDCT (Alpha optimized) | |
3 * | |
4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> | |
5 * | |
6 * This library is free software; you can redistribute it and/or | |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2 of the License, or (at your option) any later version. | |
10 * | |
11 * This library is distributed in the hope that it will be useful, | |
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 * Lesser General Public License for more details. | |
15 * | |
16 * You should have received a copy of the GNU Lesser General Public | |
17 * License along with this library; if not, write to the Free Software | |
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
19 * | |
20 * based upon some commented-out C code from mpeg2dec (idct_mmx.c | |
21 * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>) | |
22 * | |
23 * Alpha optimizations by Måns Rullgård <mru@users.sourceforge.net> | |
24 * and Falk Hueffner <falk@debian.org> | |
25 */ | |
26 | |
27 #include "asm.h" | |
28 #include "../dsputil.h" | |
29 | |
/*
 * Fixed-point IDCT coefficients:
 *     Wi = cos(i * M_PI / 16) * sqrt(2) * (1 << 14)
 *
 * W4 is actually exactly 16384, but using 16383 works around
 * accumulating rounding errors for some encoders.
 */
#define W1 ((int_fast32_t) 22725)
#define W2 ((int_fast32_t) 21407)
#define W3 ((int_fast32_t) 19266)
#define W4 ((int_fast32_t) 16383)
#define W5 ((int_fast32_t) 12873)
#define W6 ((int_fast32_t) 8867)
#define W7 ((int_fast32_t) 4520)

/* Right shifts applied to the results of the row and column passes. */
#define ROW_SHIFT 11
#define COL_SHIFT 20
42 | |
43 /* 0: all entries 0, 1: only first entry nonzero, 2: otherwise */ | |
44 static inline int idct_row(DCTELEM *row) | |
45 { | |
46 int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3, t; | |
47 uint64_t l, r; | |
48 l = ldq(row); | |
49 r = ldq(row + 4); | |
50 | |
51 if (l == 0 && r == 0) | |
52 return 0; | |
53 | |
54 a0 = W4 * sextw(l) + (1 << (ROW_SHIFT - 1)); | |
55 | |
56 if (((l & ~0xffffUL) | r) == 0) { | |
57 a0 >>= ROW_SHIFT; | |
58 a0 = (uint16_t) a0; | |
59 a0 |= a0 << 16; | |
60 a0 |= a0 << 32; | |
61 | |
62 stq(a0, row); | |
63 stq(a0, row + 4); | |
64 return 1; | |
65 } | |
66 | |
67 a1 = a0; | |
68 a2 = a0; | |
69 a3 = a0; | |
70 | |
71 t = extwl(l, 4); /* row[2] */ | |
72 if (t != 0) { | |
73 t = sextw(t); | |
74 a0 += W2 * t; | |
75 a1 += W6 * t; | |
76 a2 -= W6 * t; | |
77 a3 -= W2 * t; | |
78 } | |
79 | |
80 t = extwl(r, 0); /* row[4] */ | |
81 if (t != 0) { | |
82 t = sextw(t); | |
83 a0 += W4 * t; | |
84 a1 -= W4 * t; | |
85 a2 -= W4 * t; | |
86 a3 += W4 * t; | |
87 } | |
88 | |
89 t = extwl(r, 4); /* row[6] */ | |
90 if (t != 0) { | |
91 t = sextw(t); | |
92 a0 += W6 * t; | |
93 a1 -= W2 * t; | |
94 a2 += W2 * t; | |
95 a3 -= W6 * t; | |
96 } | |
97 | |
98 t = extwl(l, 2); /* row[1] */ | |
99 if (t != 0) { | |
100 t = sextw(t); | |
101 b0 = W1 * t; | |
102 b1 = W3 * t; | |
103 b2 = W5 * t; | |
104 b3 = W7 * t; | |
105 } else { | |
106 b0 = 0; | |
107 b1 = 0; | |
108 b2 = 0; | |
109 b3 = 0; | |
110 } | |
111 | |
112 t = extwl(l, 6); /* row[3] */ | |
113 if (t) { | |
114 t = sextw(t); | |
115 b0 += W3 * t; | |
116 b1 -= W7 * t; | |
117 b2 -= W1 * t; | |
118 b3 -= W5 * t; | |
119 } | |
120 | |
121 | |
122 t = extwl(r, 2); /* row[5] */ | |
123 if (t) { | |
124 t = sextw(t); | |
125 b0 += W5 * t; | |
126 b1 -= W1 * t; | |
127 b2 += W7 * t; | |
128 b3 += W3 * t; | |
129 } | |
130 | |
131 t = extwl(r, 6); /* row[7] */ | |
132 if (t) { | |
133 t = sextw(t); | |
134 b0 += W7 * t; | |
135 b1 -= W5 * t; | |
136 b2 += W3 * t; | |
137 b3 -= W1 * t; | |
138 } | |
139 | |
140 row[0] = (a0 + b0) >> ROW_SHIFT; | |
141 row[1] = (a1 + b1) >> ROW_SHIFT; | |
142 row[2] = (a2 + b2) >> ROW_SHIFT; | |
143 row[3] = (a3 + b3) >> ROW_SHIFT; | |
144 row[4] = (a3 - b3) >> ROW_SHIFT; | |
145 row[5] = (a2 - b2) >> ROW_SHIFT; | |
146 row[6] = (a1 - b1) >> ROW_SHIFT; | |
147 row[7] = (a0 - b0) >> ROW_SHIFT; | |
148 | |
149 return 2; | |
150 } | |
151 | |
152 static inline void idct_col(DCTELEM *col) | |
153 { | |
154 int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3; | |
155 | |
156 col[0] += (1 << (COL_SHIFT - 1)) / W4; | |
157 | |
158 a0 = W4 * col[8 * 0]; | |
159 a1 = W4 * col[8 * 0]; | |
160 a2 = W4 * col[8 * 0]; | |
161 a3 = W4 * col[8 * 0]; | |
162 | |
163 if (col[8 * 2]) { | |
164 a0 += W2 * col[8 * 2]; | |
165 a1 += W6 * col[8 * 2]; | |
166 a2 -= W6 * col[8 * 2]; | |
167 a3 -= W2 * col[8 * 2]; | |
168 } | |
169 | |
170 if (col[8 * 4]) { | |
171 a0 += W4 * col[8 * 4]; | |
172 a1 -= W4 * col[8 * 4]; | |
173 a2 -= W4 * col[8 * 4]; | |
174 a3 += W4 * col[8 * 4]; | |
175 } | |
176 | |
177 if (col[8 * 6]) { | |
178 a0 += W6 * col[8 * 6]; | |
179 a1 -= W2 * col[8 * 6]; | |
180 a2 += W2 * col[8 * 6]; | |
181 a3 -= W6 * col[8 * 6]; | |
182 } | |
183 | |
184 if (col[8 * 1]) { | |
185 b0 = W1 * col[8 * 1]; | |
186 b1 = W3 * col[8 * 1]; | |
187 b2 = W5 * col[8 * 1]; | |
188 b3 = W7 * col[8 * 1]; | |
189 } else { | |
190 b0 = 0; | |
191 b1 = 0; | |
192 b2 = 0; | |
193 b3 = 0; | |
194 } | |
195 | |
196 if (col[8 * 3]) { | |
197 b0 += W3 * col[8 * 3]; | |
198 b1 -= W7 * col[8 * 3]; | |
199 b2 -= W1 * col[8 * 3]; | |
200 b3 -= W5 * col[8 * 3]; | |
201 } | |
202 | |
203 if (col[8 * 5]) { | |
204 b0 += W5 * col[8 * 5]; | |
205 b1 -= W1 * col[8 * 5]; | |
206 b2 += W7 * col[8 * 5]; | |
207 b3 += W3 * col[8 * 5]; | |
208 } | |
209 | |
210 if (col[8 * 7]) { | |
211 b0 += W7 * col[8 * 7]; | |
212 b1 -= W5 * col[8 * 7]; | |
213 b2 += W3 * col[8 * 7]; | |
214 b3 -= W1 * col[8 * 7]; | |
215 } | |
216 | |
217 col[8 * 0] = (a0 + b0) >> COL_SHIFT; | |
218 col[8 * 7] = (a0 - b0) >> COL_SHIFT; | |
219 col[8 * 1] = (a1 + b1) >> COL_SHIFT; | |
220 col[8 * 6] = (a1 - b1) >> COL_SHIFT; | |
221 col[8 * 2] = (a2 + b2) >> COL_SHIFT; | |
222 col[8 * 5] = (a2 - b2) >> COL_SHIFT; | |
223 col[8 * 3] = (a3 + b3) >> COL_SHIFT; | |
224 col[8 * 4] = (a3 - b3) >> COL_SHIFT; | |
225 } | |
226 | |
227 /* If all rows but the first one are zero after row transformation, | |
228 all rows will be identical after column transformation. */ | |
229 static inline void idct_col2(DCTELEM *col) | |
230 { | |
231 int i; | |
232 uint64_t l, r; | |
233 uint64_t *lcol = (uint64_t *) col; | |
234 | |
235 for (i = 0; i < 8; ++i) { | |
236 int_fast32_t a0 = col[0] + (1 << (COL_SHIFT - 1)) / W4; | |
237 | |
238 a0 *= W4; | |
239 col[0] = a0 >> COL_SHIFT; | |
240 ++col; | |
241 } | |
242 | |
243 l = lcol[0]; | |
244 r = lcol[1]; | |
245 lcol[ 2] = l; lcol[ 3] = r; | |
246 lcol[ 4] = l; lcol[ 5] = r; | |
247 lcol[ 6] = l; lcol[ 7] = r; | |
248 lcol[ 8] = l; lcol[ 9] = r; | |
249 lcol[10] = l; lcol[11] = r; | |
250 lcol[12] = l; lcol[13] = r; | |
251 lcol[14] = l; lcol[15] = r; | |
252 } | |
253 | |
254 void simple_idct_axp(DCTELEM *block) | |
255 { | |
256 | |
257 int i; | |
258 int rowsZero = 1; /* all rows except row 0 zero */ | |
259 int rowsConstant = 1; /* all rows consist of a constant value */ | |
260 | |
261 for (i = 0; i < 8; i++) { | |
262 int sparseness = idct_row(block + 8 * i); | |
263 | |
264 if (i > 0 && sparseness > 0) | |
265 rowsZero = 0; | |
266 if (sparseness == 2) | |
267 rowsConstant = 0; | |
268 } | |
269 | |
270 if (rowsZero) { | |
271 idct_col2(block); | |
272 } else if (rowsConstant) { | |
273 uint64_t *lblock = (uint64_t *) block; | |
274 | |
275 idct_col(block); | |
276 for (i = 0; i < 8; i += 2) { | |
277 uint64_t v = (uint16_t) block[i * 8]; | |
278 uint64_t w = (uint16_t) block[i * 8 + 8]; | |
279 | |
280 v |= v << 16; | |
281 w |= w << 16; | |
282 v |= v << 32; | |
283 w |= w << 32; | |
284 lblock[0] = v; | |
285 lblock[1] = v; | |
286 lblock[2] = w; | |
287 lblock[3] = w; | |
288 lblock += 4; | |
289 } | |
290 } else { | |
291 for (i = 0; i < 8; i++) | |
292 idct_col(block + i); | |
293 } | |
294 } | |
295 | |
296 void simple_idct_put_axp(uint8_t *dest, int line_size, DCTELEM *block) | |
297 { | |
298 simple_idct_axp(block); | |
299 put_pixels_clamped(block, dest, line_size); | |
300 } | |
301 | |
302 void simple_idct_add_axp(uint8_t *dest, int line_size, DCTELEM *block) | |
303 { | |
304 simple_idct_axp(block); | |
305 add_pixels_clamped(block, dest, line_size); | |
306 } |