comparison alpha/simple_idct_alpha.c @ 744:2f7da29ede37 libavcodec

Move Alpha optimized IDCT to own file. Based on a patch by Måns Rullgård <mru@users.sourceforge.net>. I've left out the idctCol2 part, because W4 has recently been decreed to be 16383, and also I doubt it will give a noticeable speedup.
author mellum
date Fri, 11 Oct 2002 23:01:16 +0000
parents
children 3dbbdc2f8bd3
comparison
equal deleted inserted replaced
743:4cf7173a004e 744:2f7da29ede37
1 /*
2 * Simple IDCT (Alpha optimized)
3 *
4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 * based upon some commented-out C code from mpeg2dec (idct_mmx.c
21 * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
22 *
23 * Alpha optimizations by Måns Rullgård <mru@users.sourceforge.net>
24 * and Falk Hueffner <falk@debian.org>
25 */
26
27 #include "asm.h"
28 #include "../dsputil.h"
29
/*
 * Fixed-point transform weights: Wi = cos(i * M_PI / 16) * sqrt(2) * (1 << 14).
 *
 * W4 would be exactly 16384; 16383 is used instead to work around
 * accumulating rounding errors for some encoders.
 *
 * Every weight is cast to int_fast32_t so that products with 16-bit
 * coefficients are evaluated in that type.
 */
#define W1 ((int_fast32_t) 22725)   /* i = 1 */
#define W2 ((int_fast32_t) 21407)   /* i = 2 */
#define W3 ((int_fast32_t) 19266)   /* i = 3 */
#define W4 ((int_fast32_t) 16383)   /* i = 4, deliberately one below 16384 */
#define W5 ((int_fast32_t) 12873)   /* i = 5 */
#define W6 ((int_fast32_t) 8867)    /* i = 6 */
#define W7 ((int_fast32_t) 4520)    /* i = 7 */

/* Right shifts applied to the results of the row pass and the column pass. */
#define ROW_SHIFT 11
#define COL_SHIFT 20
42
43 /* 0: all entries 0, 1: only first entry nonzero, 2: otherwise */
44 static inline int idct_row(DCTELEM *row)
45 {
46 int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3, t;
47 uint64_t l, r;
48 l = ldq(row);
49 r = ldq(row + 4);
50
51 if (l == 0 && r == 0)
52 return 0;
53
54 a0 = W4 * sextw(l) + (1 << (ROW_SHIFT - 1));
55
56 if (((l & ~0xffffUL) | r) == 0) {
57 a0 >>= ROW_SHIFT;
58 a0 = (uint16_t) a0;
59 a0 |= a0 << 16;
60 a0 |= a0 << 32;
61
62 stq(a0, row);
63 stq(a0, row + 4);
64 return 1;
65 }
66
67 a1 = a0;
68 a2 = a0;
69 a3 = a0;
70
71 t = extwl(l, 4); /* row[2] */
72 if (t != 0) {
73 t = sextw(t);
74 a0 += W2 * t;
75 a1 += W6 * t;
76 a2 -= W6 * t;
77 a3 -= W2 * t;
78 }
79
80 t = extwl(r, 0); /* row[4] */
81 if (t != 0) {
82 t = sextw(t);
83 a0 += W4 * t;
84 a1 -= W4 * t;
85 a2 -= W4 * t;
86 a3 += W4 * t;
87 }
88
89 t = extwl(r, 4); /* row[6] */
90 if (t != 0) {
91 t = sextw(t);
92 a0 += W6 * t;
93 a1 -= W2 * t;
94 a2 += W2 * t;
95 a3 -= W6 * t;
96 }
97
98 t = extwl(l, 2); /* row[1] */
99 if (t != 0) {
100 t = sextw(t);
101 b0 = W1 * t;
102 b1 = W3 * t;
103 b2 = W5 * t;
104 b3 = W7 * t;
105 } else {
106 b0 = 0;
107 b1 = 0;
108 b2 = 0;
109 b3 = 0;
110 }
111
112 t = extwl(l, 6); /* row[3] */
113 if (t) {
114 t = sextw(t);
115 b0 += W3 * t;
116 b1 -= W7 * t;
117 b2 -= W1 * t;
118 b3 -= W5 * t;
119 }
120
121
122 t = extwl(r, 2); /* row[5] */
123 if (t) {
124 t = sextw(t);
125 b0 += W5 * t;
126 b1 -= W1 * t;
127 b2 += W7 * t;
128 b3 += W3 * t;
129 }
130
131 t = extwl(r, 6); /* row[7] */
132 if (t) {
133 t = sextw(t);
134 b0 += W7 * t;
135 b1 -= W5 * t;
136 b2 += W3 * t;
137 b3 -= W1 * t;
138 }
139
140 row[0] = (a0 + b0) >> ROW_SHIFT;
141 row[1] = (a1 + b1) >> ROW_SHIFT;
142 row[2] = (a2 + b2) >> ROW_SHIFT;
143 row[3] = (a3 + b3) >> ROW_SHIFT;
144 row[4] = (a3 - b3) >> ROW_SHIFT;
145 row[5] = (a2 - b2) >> ROW_SHIFT;
146 row[6] = (a1 - b1) >> ROW_SHIFT;
147 row[7] = (a0 - b0) >> ROW_SHIFT;
148
149 return 2;
150 }
151
152 static inline void idct_col(DCTELEM *col)
153 {
154 int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3;
155
156 col[0] += (1 << (COL_SHIFT - 1)) / W4;
157
158 a0 = W4 * col[8 * 0];
159 a1 = W4 * col[8 * 0];
160 a2 = W4 * col[8 * 0];
161 a3 = W4 * col[8 * 0];
162
163 if (col[8 * 2]) {
164 a0 += W2 * col[8 * 2];
165 a1 += W6 * col[8 * 2];
166 a2 -= W6 * col[8 * 2];
167 a3 -= W2 * col[8 * 2];
168 }
169
170 if (col[8 * 4]) {
171 a0 += W4 * col[8 * 4];
172 a1 -= W4 * col[8 * 4];
173 a2 -= W4 * col[8 * 4];
174 a3 += W4 * col[8 * 4];
175 }
176
177 if (col[8 * 6]) {
178 a0 += W6 * col[8 * 6];
179 a1 -= W2 * col[8 * 6];
180 a2 += W2 * col[8 * 6];
181 a3 -= W6 * col[8 * 6];
182 }
183
184 if (col[8 * 1]) {
185 b0 = W1 * col[8 * 1];
186 b1 = W3 * col[8 * 1];
187 b2 = W5 * col[8 * 1];
188 b3 = W7 * col[8 * 1];
189 } else {
190 b0 = 0;
191 b1 = 0;
192 b2 = 0;
193 b3 = 0;
194 }
195
196 if (col[8 * 3]) {
197 b0 += W3 * col[8 * 3];
198 b1 -= W7 * col[8 * 3];
199 b2 -= W1 * col[8 * 3];
200 b3 -= W5 * col[8 * 3];
201 }
202
203 if (col[8 * 5]) {
204 b0 += W5 * col[8 * 5];
205 b1 -= W1 * col[8 * 5];
206 b2 += W7 * col[8 * 5];
207 b3 += W3 * col[8 * 5];
208 }
209
210 if (col[8 * 7]) {
211 b0 += W7 * col[8 * 7];
212 b1 -= W5 * col[8 * 7];
213 b2 += W3 * col[8 * 7];
214 b3 -= W1 * col[8 * 7];
215 }
216
217 col[8 * 0] = (a0 + b0) >> COL_SHIFT;
218 col[8 * 7] = (a0 - b0) >> COL_SHIFT;
219 col[8 * 1] = (a1 + b1) >> COL_SHIFT;
220 col[8 * 6] = (a1 - b1) >> COL_SHIFT;
221 col[8 * 2] = (a2 + b2) >> COL_SHIFT;
222 col[8 * 5] = (a2 - b2) >> COL_SHIFT;
223 col[8 * 3] = (a3 + b3) >> COL_SHIFT;
224 col[8 * 4] = (a3 - b3) >> COL_SHIFT;
225 }
226
227 /* If all rows but the first one are zero after row transformation,
228 all rows will be identical after column transformation. */
229 static inline void idct_col2(DCTELEM *col)
230 {
231 int i;
232 uint64_t l, r;
233 uint64_t *lcol = (uint64_t *) col;
234
235 for (i = 0; i < 8; ++i) {
236 int_fast32_t a0 = col[0] + (1 << (COL_SHIFT - 1)) / W4;
237
238 a0 *= W4;
239 col[0] = a0 >> COL_SHIFT;
240 ++col;
241 }
242
243 l = lcol[0];
244 r = lcol[1];
245 lcol[ 2] = l; lcol[ 3] = r;
246 lcol[ 4] = l; lcol[ 5] = r;
247 lcol[ 6] = l; lcol[ 7] = r;
248 lcol[ 8] = l; lcol[ 9] = r;
249 lcol[10] = l; lcol[11] = r;
250 lcol[12] = l; lcol[13] = r;
251 lcol[14] = l; lcol[15] = r;
252 }
253
254 void simple_idct_axp(DCTELEM *block)
255 {
256
257 int i;
258 int rowsZero = 1; /* all rows except row 0 zero */
259 int rowsConstant = 1; /* all rows consist of a constant value */
260
261 for (i = 0; i < 8; i++) {
262 int sparseness = idct_row(block + 8 * i);
263
264 if (i > 0 && sparseness > 0)
265 rowsZero = 0;
266 if (sparseness == 2)
267 rowsConstant = 0;
268 }
269
270 if (rowsZero) {
271 idct_col2(block);
272 } else if (rowsConstant) {
273 uint64_t *lblock = (uint64_t *) block;
274
275 idct_col(block);
276 for (i = 0; i < 8; i += 2) {
277 uint64_t v = (uint16_t) block[i * 8];
278 uint64_t w = (uint16_t) block[i * 8 + 8];
279
280 v |= v << 16;
281 w |= w << 16;
282 v |= v << 32;
283 w |= w << 32;
284 lblock[0] = v;
285 lblock[1] = v;
286 lblock[2] = w;
287 lblock[3] = w;
288 lblock += 4;
289 }
290 } else {
291 for (i = 0; i < 8; i++)
292 idct_col(block + i);
293 }
294 }
295
296 void simple_idct_put_axp(uint8_t *dest, int line_size, DCTELEM *block)
297 {
298 simple_idct_axp(block);
299 put_pixels_clamped(block, dest, line_size);
300 }
301
302 void simple_idct_add_axp(uint8_t *dest, int line_size, DCTELEM *block)
303 {
304 simple_idct_axp(block);
305 add_pixels_clamped(block, dest, line_size);
306 }