Mercurial > mplayer.hg
annotate libmpeg2/idct_alpha.c @ 18693:a4a6b2cf5022
Do not use border for bicubic filter helper texture, since it will cause ATI
cards to switch to software mode and be unusable. Double texture size to avoid
this causing artefacts.
Note: yuv=6 will not be changed, so it will stay unusable with ATI cards unless
ATI starts supporting this.
author | reimar |
---|---|
date | Tue, 13 Jun 2006 19:55:20 +0000 |
parents | 1385ec491ffb |
children | 0783dd397f74 |
rev | line source |
---|---|
9857 | 1 /* |
2 * idct_alpha.c | |
12932 | 3 * Copyright (C) 2002-2003 Falk Hueffner <falk@debian.org> |
4 * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org> | |
9857 | 5 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> |
6 * | |
7 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. | |
8 * See http://libmpeg2.sourceforge.net/ for updates. | |
9 * | |
10 * mpeg2dec is free software; you can redistribute it and/or modify | |
11 * it under the terms of the GNU General Public License as published by | |
12 * the Free Software Foundation; either version 2 of the License, or | |
13 * (at your option) any later version. | |
14 * | |
15 * mpeg2dec is distributed in the hope that it will be useful, | |
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 * GNU General Public License for more details. | |
19 * | |
20 * You should have received a copy of the GNU General Public License | |
21 * along with this program; if not, write to the Free Software | |
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
14732 (changeset 1385ec491ffb by diego, parent 12932: Mark locally modified files as such to comply more closely with GPL 2a.) | 23 * |
24 * Modified for use with MPlayer, see libmpeg-0.4.0.diff for the exact changes. | |
25 * detailed CVS changelog at http://www.mplayerhq.hu/cgi-bin/cvsweb.cgi/main/ | |
26 * $Id$ | |
9857 | 27 */ |
28 | |
29 #include "config.h" | |
30 | |
31 #ifdef ARCH_ALPHA | |
32 | |
33 #include <stdlib.h> | |
34 #include <inttypes.h> | |
35 | |
12932 | 36 #include "mpeg2.h" |
37 #include "attributes.h" | |
38 #include "mpeg2_internal.h" | |
9857 | 39 #include "alpha_asm.h" |
40 | |
/*
 * Fixed-point IDCT weights: Wk is 2048 * sqrt (2) * cos (k * pi / 16),
 * rounded to the nearest integer.
 */
enum {
    W1 = 2841,  /* 2048 * sqrt (2) * cos (1 * pi / 16) */
    W2 = 2676,  /* 2048 * sqrt (2) * cos (2 * pi / 16) */
    W3 = 2408,  /* 2048 * sqrt (2) * cos (3 * pi / 16) */
    W5 = 1609,  /* 2048 * sqrt (2) * cos (5 * pi / 16) */
    W6 = 1108,  /* 2048 * sqrt (2) * cos (6 * pi / 16) */
    W7 = 565    /* 2048 * sqrt (2) * cos (7 * pi / 16) */
};

/*
 * Lookup table defined elsewhere.  CLIP biases the index by 3840, so any
 * index from -3840 up to 3840 + 255 stays inside the array.
 * NOTE(review): presumably the table saturates its index to 0..255 --
 * confirm against the code that fills mpeg2_clip.
 */
extern uint8_t mpeg2_clip[3840 * 2 + 256];
#define CLIP(i) (*(mpeg2_clip + 3840 + (i)))
9857 | 50 |
/*
 * BUTTERFLY (t0, t1, wa, wb, d0, d1): rotation step shared by the row and
 * column transforms.  It stores
 *
 *     t0 = wa * d0 + wb * d1
 *     t1 = wa * d1 - wb * d0
 *
 * The disabled variant is the direct form with four multiplications; the
 * enabled one yields the same values with three by sharing wa * (d0 + d1).
 *
 * All four inputs are latched into locals before anything is written, so
 * expression arguments expand correctly, each input is evaluated exactly
 * once, and t0/t1 may name the same objects as d0/d1.  The weight
 * parameters are called wa/wb rather than W0/W1 so they cannot be mistaken
 * for the W1 constant used by the callers.
 */
#if 0
#define BUTTERFLY(t0,t1,wa,wb,d0,d1)                        \
do {                                                        \
    const int_fast32_t bf_wa_ = (wa);                       \
    const int_fast32_t bf_wb_ = (wb);                       \
    const int_fast32_t bf_d0_ = (d0);                       \
    const int_fast32_t bf_d1_ = (d1);                       \
    (t0) = bf_wa_ * bf_d0_ + bf_wb_ * bf_d1_;               \
    (t1) = bf_wa_ * bf_d1_ - bf_wb_ * bf_d0_;               \
} while (0)
#else
#define BUTTERFLY(t0,t1,wa,wb,d0,d1)                        \
do {                                                        \
    const int_fast32_t bf_wa_ = (wa);                       \
    const int_fast32_t bf_wb_ = (wb);                       \
    const int_fast32_t bf_d0_ = (d0);                       \
    const int_fast32_t bf_d1_ = (d1);                       \
    const int_fast32_t bf_sum_ = bf_wa_ * (bf_d0_ + bf_d1_); \
    (t0) = bf_sum_ + (bf_wb_ - bf_wa_) * bf_d1_;            \
    (t1) = bf_sum_ - (bf_wb_ + bf_wa_) * bf_d0_;            \
} while (0)
#endif
65 | |
10392 | 66 static inline void idct_row (int16_t * const block) |
9857 | 67 { |
68 uint64_t l, r; | |
69 int_fast32_t d0, d1, d2, d3; | |
70 int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3; | |
71 int_fast32_t t0, t1, t2, t3; | |
72 | |
73 l = ldq (block); | |
74 r = ldq (block + 4); | |
75 | |
76 /* shortcut */ | |
77 if (likely (!((l & ~0xffffUL) | r))) { | |
12932 | 78 uint64_t tmp = (uint16_t) (l >> 1); |
9857 | 79 tmp |= tmp << 16; |
80 tmp |= tmp << 32; | |
81 ((int32_t *)block)[0] = tmp; | |
82 ((int32_t *)block)[1] = tmp; | |
83 ((int32_t *)block)[2] = tmp; | |
84 ((int32_t *)block)[3] = tmp; | |
85 return; | |
86 } | |
87 | |
12932 | 88 d0 = (sextw (l) << 11) + 2048; |
9857 | 89 d1 = sextw (extwl (l, 2)); |
90 d2 = sextw (extwl (l, 4)) << 11; | |
91 d3 = sextw (extwl (l, 6)); | |
92 t0 = d0 + d2; | |
93 t1 = d0 - d2; | |
94 BUTTERFLY (t2, t3, W6, W2, d3, d1); | |
95 a0 = t0 + t2; | |
96 a1 = t1 + t3; | |
97 a2 = t1 - t3; | |
98 a3 = t0 - t2; | |
99 | |
100 d0 = sextw (r); | |
101 d1 = sextw (extwl (r, 2)); | |
102 d2 = sextw (extwl (r, 4)); | |
103 d3 = sextw (extwl (r, 6)); | |
104 BUTTERFLY (t0, t1, W7, W1, d3, d0); | |
105 BUTTERFLY (t2, t3, W3, W5, d1, d2); | |
106 b0 = t0 + t2; | |
107 b3 = t1 + t3; | |
108 t0 -= t2; | |
109 t1 -= t3; | |
12932 | 110 b1 = ((t0 + t1) >> 8) * 181; |
111 b2 = ((t0 - t1) >> 8) * 181; | |
9857 | 112 |
12932 | 113 block[0] = (a0 + b0) >> 12; |
114 block[1] = (a1 + b1) >> 12; | |
115 block[2] = (a2 + b2) >> 12; | |
116 block[3] = (a3 + b3) >> 12; | |
117 block[4] = (a3 - b3) >> 12; | |
118 block[5] = (a2 - b2) >> 12; | |
119 block[6] = (a1 - b1) >> 12; | |
120 block[7] = (a0 - b0) >> 12; | |
9857 | 121 } |
122 | |
10392 | 123 static inline void idct_col (int16_t * const block) |
9857 | 124 { |
125 int_fast32_t d0, d1, d2, d3; | |
126 int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3; | |
127 int_fast32_t t0, t1, t2, t3; | |
128 | |
129 d0 = (block[8*0] << 11) + 65536; | |
130 d1 = block[8*1]; | |
131 d2 = block[8*2] << 11; | |
132 d3 = block[8*3]; | |
133 t0 = d0 + d2; | |
134 t1 = d0 - d2; | |
135 BUTTERFLY (t2, t3, W6, W2, d3, d1); | |
136 a0 = t0 + t2; | |
137 a1 = t1 + t3; | |
138 a2 = t1 - t3; | |
139 a3 = t0 - t2; | |
140 | |
141 d0 = block[8*4]; | |
142 d1 = block[8*5]; | |
143 d2 = block[8*6]; | |
144 d3 = block[8*7]; | |
145 BUTTERFLY (t0, t1, W7, W1, d3, d0); | |
146 BUTTERFLY (t2, t3, W3, W5, d1, d2); | |
147 b0 = t0 + t2; | |
148 b3 = t1 + t3; | |
12932 | 149 t0 -= t2; |
150 t1 -= t3; | |
151 b1 = ((t0 + t1) >> 8) * 181; | |
152 b2 = ((t0 - t1) >> 8) * 181; | |
9857 | 153 |
154 block[8*0] = (a0 + b0) >> 17; | |
155 block[8*1] = (a1 + b1) >> 17; | |
156 block[8*2] = (a2 + b2) >> 17; | |
157 block[8*3] = (a3 + b3) >> 17; | |
158 block[8*4] = (a3 - b3) >> 17; | |
159 block[8*5] = (a2 - b2) >> 17; | |
160 block[8*6] = (a1 - b1) >> 17; | |
161 block[8*7] = (a0 - b0) >> 17; | |
162 } | |
163 | |
10488 | 164 #ifdef CAN_COMPILE_ALPHA_MVI |
/*
 * Inverse-transform the 8x8 coefficient block in place, write the result
 * to dest as 8 rows of 8 bytes (rows are stride bytes apart), and leave
 * the coefficient block zeroed.  MVI variant: each group of four 16-bit
 * results is clamped to 0..255 and packed to bytes with maxsw4 / minsw4 /
 * pkwb.
 */
void mpeg2_idct_copy_mvi (int16_t * block, uint8_t * dest, const int stride)
{
    uint64_t byte_max;
    int row, col;

    for (row = 0; row < 8; row++)
        idct_row (block + 8 * row);

    for (col = 0; col < 8; col++)
        idct_col (block + col);

    byte_max = zap (-1, 0xaa);  /* 0x00ff00ff00ff00ff */

    for (row = 0; row < 8; row++) {
        uint64_t quad;

        /* pixels 0..3 of this row */
        quad = ldq (block);
        quad = maxsw4 (quad, 0);
        quad = minsw4 (quad, byte_max);
        stl (pkwb (quad), dest);

        /* pixels 4..7 of this row */
        quad = ldq (block + 4);
        quad = maxsw4 (quad, 0);
        quad = minsw4 (quad, byte_max);
        stl (pkwb (quad), dest + 4);

        /* clear the consumed coefficients */
        stq (0, block);
        stq (0, block + 4);

        dest += stride;
        block += 8;
    }
}
197 | |
/*
 * Add the inverse transform of the 8x8 coefficient block to the 8x8 byte
 * area at dest (rows are stride bytes apart), saturating each pixel to
 * 0..255.  MVI variant.
 *
 * When last == 129 and bits 4..6 of block[0] are not 100b, only block[0]
 * is read: a single rounded value (block[0] + 64) >> 7 is added to every
 * pixel, and block[0] and block[63] are cleared.
 * NOTE(review): presumably last == 129 is the decoder's marker for a block
 * whose only significant coefficient is the DC term -- confirm with the
 * caller.
 * In every other case the full transform runs and the whole coefficient
 * block is zeroed afterwards.
 */
void mpeg2_idct_add_mvi (const int last, int16_t * block,
                         uint8_t * dest, const int stride)
{
    uint64_t byte_max;
    uint64_t sign_bits;
    int n;

    if (last == 129 && (block[0] & (7 << 4)) != (4 << 4)) {
        uint64_t p0, p1, p2, p3, p4, p5, p6, p7;
        uint64_t dc_vec;
        const int dc = (block[0] + 64) >> 7;

        block[0] = block[63] = 0;

        p0 = ldq (dest + 0 * stride);
        p1 = ldq (dest + 1 * stride);
        p2 = ldq (dest + 2 * stride);
        p3 = ldq (dest + 3 * stride);
        p4 = ldq (dest + 4 * stride);
        p5 = ldq (dest + 5 * stride);
        p6 = ldq (dest + 6 * stride);
        p7 = ldq (dest + 7 * stride);

        if (dc > 0) {
            /*
             * Saturating add: per byte, ~p is 255 - p, so the amount
             * added never exceeds the headroom left in that byte.
             */
            dc_vec = BYTE_VEC (likely (dc <= 255) ? dc : 255);
            p0 += minub8 (dc_vec, ~p0);
            p1 += minub8 (dc_vec, ~p1);
            p2 += minub8 (dc_vec, ~p2);
            p3 += minub8 (dc_vec, ~p3);
            p4 += minub8 (dc_vec, ~p4);
            p5 += minub8 (dc_vec, ~p5);
            p6 += minub8 (dc_vec, ~p6);
            p7 += minub8 (dc_vec, ~p7);
        } else {
            /* Saturating subtract: never take more than the byte holds. */
            dc_vec = BYTE_VEC (likely (-dc <= 255) ? -dc : 255);
            p0 -= minub8 (dc_vec, p0);
            p1 -= minub8 (dc_vec, p1);
            p2 -= minub8 (dc_vec, p2);
            p3 -= minub8 (dc_vec, p3);
            p4 -= minub8 (dc_vec, p4);
            p5 -= minub8 (dc_vec, p5);
            p6 -= minub8 (dc_vec, p6);
            p7 -= minub8 (dc_vec, p7);
        }

        stq (p0, dest + 0 * stride);
        stq (p1, dest + 1 * stride);
        stq (p2, dest + 2 * stride);
        stq (p3, dest + 3 * stride);
        stq (p4, dest + 4 * stride);
        stq (p5, dest + 5 * stride);
        stq (p6, dest + 6 * stride);
        stq (p7, dest + 7 * stride);
        return;
    }

    for (n = 0; n < 8; n++)
        idct_row (block + 8 * n);
    for (n = 0; n < 8; n++)
        idct_col (block + n);

    byte_max = zap (-1, 0xaa);          /* 0x00ff00ff00ff00ff */
    sign_bits = zap (-1, 0x33);
    sign_bits ^= sign_bits >> 1;        /* 0x8000800080008000 */

    for (n = 0; n < 8; n++) {
        uint64_t lo, hi;
        uint64_t lo_sign, hi_sign;

        lo = ldq (block);
        hi = ldq (block + 4);

        /*
         * Four 16-bit signed additions inside one 64-bit add (like MMX
         * paddw): with each lane's top bit cleared, adding a pixel
         * (0..255) cannot carry into the neighbouring lane; the saved
         * top bit is folded back in with XOR afterwards.
         */
        lo_sign = lo & sign_bits;
        lo &= ~sign_bits;
        lo += unpkbw (ldl (dest));
        lo ^= lo_sign;
        /* clamp every lane to 0..255 */
        lo = maxsw4 (lo, 0);
        lo = minsw4 (lo, byte_max);

        /* same again for pixels 4..7 */
        hi_sign = hi & sign_bits;
        hi &= ~sign_bits;
        hi += unpkbw (ldl (dest + 4));
        hi ^= hi_sign;
        hi = maxsw4 (hi, 0);
        hi = minsw4 (hi, byte_max);

        stl (pkwb (lo), dest);
        stl (pkwb (hi), dest + 4);
        stq (0, block);
        stq (0, block + 4);

        dest += stride;
        block += 8;
    }
}
10488 | 297 #endif |
9857 | 298 |
/*
 * Inverse-transform the 8x8 coefficient block in place, store each result
 * through CLIP into dest (8 rows of 8 bytes, rows stride bytes apart), and
 * leave the coefficient block zeroed.  Variant without MVI instructions.
 */
void mpeg2_idct_copy_alpha (int16_t * block, uint8_t * dest, const int stride)
{
    int row, col;

    for (row = 0; row < 8; row++)
        idct_row (block + 8 * row);
    for (col = 0; col < 8; col++)
        idct_col (block + col);

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            dest[col] = CLIP (block[col]);

        /* clear the eight coefficients just consumed */
        stq (0, block);
        stq (0, block + 4);

        dest += stride;
        block += 8;
    }
}
324 | |
/*
 * Add the inverse transform of the 8x8 coefficient block to the 8x8 byte
 * area at dest (rows are stride bytes apart), passing every sum through
 * CLIP.  Variant without MVI instructions.
 *
 * When last == 129 and bits 4..6 of block[0] are not 100b, only block[0]
 * is read: the single value (block[0] + 64) >> 7 is added to every pixel,
 * and block[0] and block[63] are cleared.
 * NOTE(review): presumably last == 129 marks a block whose only
 * significant coefficient is the DC term -- confirm with the caller.
 * Otherwise the full transform runs and the whole block is zeroed.
 */
void mpeg2_idct_add_alpha (const int last, int16_t * block,
                           uint8_t * dest, const int stride)
{
    int row, col;

    if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) {
        for (row = 0; row < 8; row++)
            idct_row (block + 8 * row);
        for (col = 0; col < 8; col++)
            idct_col (block + col);

        for (row = 0; row < 8; row++) {
            for (col = 0; col < 8; col++)
                dest[col] = CLIP (block[col] + dest[col]);

            /* clear the eight coefficients just consumed */
            stq (0, block);
            stq (0, block + 4);

            dest += stride;
            block += 8;
        }
    } else {
        const int dc = (block[0] + 64) >> 7;

        block[0] = block[63] = 0;

        for (row = 0; row < 8; row++) {
            for (col = 0; col < 8; col++)
                dest[col] = CLIP (dc + dest[col]);
            dest += stride;
        }
    }
}
370 | |
/*
 * Rewrite every entry of the two 64-entry scan tables in place.
 *
 * Each entry is treated as two 3-bit fields (bits 0..2 and bits 3..5);
 * inside each field bit 1 moves to bit 0, bit 2 to bit 1 and bit 0 to
 * bit 2, i.e. the field is rotated right by one bit.
 * NOTE(review): presumably this matches the coefficient order idct_row /
 * idct_col expect -- confirm against the code that consumes the tables.
 *
 * The remapping is applied to whatever the tables currently hold, so
 * calling this function a second time remaps them again.
 */
void mpeg2_idct_alpha_init (void)
{
    extern uint8_t mpeg2_scan_norm[64];
    extern uint8_t mpeg2_scan_alt[64];
    uint8_t * const tables[2] = { mpeg2_scan_norm, mpeg2_scan_alt };
    int t, k;

    for (t = 0; t < 2; t++) {
        uint8_t * const scan = tables[t];

        for (k = 0; k < 64; k++) {
            const int pos = scan[k];

            scan[k] = ((pos & 0x36) >> 1) | ((pos & 0x09) << 2);
        }
    }
}
384 | |
385 #endif /* ARCH_ALPHA */ |