Mercurial > mplayer.hg
annotate postproc/yuv2rgb_altivec.c @ 17557:3f863d1d8b43
vYCoeffsBank and vCCoeffsBank are allocated and initialized using incorrect
sizes based on the image width instead of height.
patch by Alan Curry, pacman at world dot std dot com
author | diego |
---|---|
date | Wed, 08 Feb 2006 08:16:53 +0000 |
parents | 08cac43f1e38 |
children | ad90899eeee6 |
rev | line source |
---|---|
12698 | 1 /* |
2 marc.hoffman@analog.com March 8, 2004 | |
3 | |
4 Altivec Acceleration for Color Space Conversion revision 0.2 | |
5 | |
6 convert I420 YV12 to RGB in various formats, | |
7 it rejects images that are not in 420 formats | |
8 it rejects images that don't have widths of multiples of 16 | |
9 it rejects images that don't have heights of multiples of 2 | |
10 reject defers to C simulation codes. | |
11 | |
12 lots of optimizations to be done here | |
13 | |
14 1. need to fix saturation code, I just couldn't get it to fly with packs and adds. | |
15 so we currently use max min to clip | |
16 | |
17 2. the inefficient use of chroma loading needs a bit of brushing up | |
18 | |
19 3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls | |
20 | |
21 | |
22 MODIFIED to calculate coeffs from currently selected color space. | |
23 MODIFIED core to be a macro which you spec the output format. | |
24 ADDED UYVY conversion which is never called due to some thing in SWSCALE. | |
25 CORRECTED algorithm selection to be strict on input formats. | |
26 ADDED runtime detection of altivec. | |
27 | |
28 ADDED altivec_yuv2packedX vertical scl + RGB converter | |
29 | |
30 March 27,2004 | |
31 PERFORMANCE ANALYSIS | |
32 | |
33 The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test | |
34 The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence | |
35 | |
36 720*480*30 ~10MPS | |
37 | |
38 so we have roughly 10clocks per pixel this is too high something has to be wrong. | |
39 | |
40 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min. | |
41 | |
42 OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much | |
43 guaranteed to have the input video frame it was just decompressed so | |
44 it probably resides in L1 caches. However we are creating the | |
45 output video stream this needs to use the DSTST instruction to | |
46 optimize for the cache. We couple this with the fact that we are | |
47 not going to be visiting the input buffer again so we mark it Least | |
48 Recently Used. This shaves 25% of the processor cycles off. | |
49 | |
50 Now MEMCPY is the largest mips consumer in the system, probably due | |
51 to the inefficient X11 stuff. | |
52 | |
53 GL libraries seem to be very slow on this machine 1.33Ghz PB running | |
54 Jaguar, this is not the case for my 1Ghz PB. I thought it might be | |
55 a versioning issues, however i have libGL.1.2.dylib for both | |
56 machines. ((We need to figure this out now)) | |
57 | |
58 GL2 libraries work now with patch for RGB32 | |
59 | |
60 NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor | |
61 | |
62 Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment. | |
63 | |
64 */ | |
65 #include <stdio.h> | |
66 #include <stdlib.h> | |
12836 | 67 #include <string.h> |
12698 | 68 #include <inttypes.h> |
69 #include <assert.h> | |
70 #include "config.h" | |
71 #include "rgb2rgb.h" | |
72 #include "swscale.h" | |
73 #include "swscale_internal.h" | |
16985 | 74 #include "mangle.h" |
75 #include "libvo/img_format.h" //FIXME try to reduce dependency of such stuff | |
12698 | 76 |
77 #undef PROFILE_THE_BEAST | |
78 #undef INC_SCALING | |
79 | |
80 typedef unsigned char ubyte; | |
81 typedef signed char sbyte; | |
82 | |
83 | |
84 /* RGB interleaver, 16 planar pels 8-bit samples per channel in | |
85 homogeneous vector registers x0,x1,x2 are interleaved with the | |
86 following technique: | |
87 | |
88 o0 = vec_mergeh (x0,x1); | |
89 o1 = vec_perm (o0, x2, perm_rgb_0); | |
90 o2 = vec_perm (o0, x2, perm_rgb_1); | |
91 o3 = vec_mergel (x0,x1); | |
92 o4 = vec_perm (o3,o2,perm_rgb_2); | |
93 o5 = vec_perm (o3,o2,perm_rgb_3); | |
94 | |
95 perm_rgb_0: o0(RG).h v1(B) --> o1* | |
96 0 1 2 3 4 | |
97 rgbr|gbrg|brgb|rgbr | |
98 0010 0100 1001 0010 | |
99 0102 3145 2673 894A | |
100 | |
101 perm_rgb_1: o0(RG).h v1(B) --> o2 | |
102 0 1 2 3 4 | |
103 gbrg|brgb|bbbb|bbbb | |
104 0100 1001 1111 1111 | |
105 B5CD 6EF7 89AB CDEF | |
106 | |
107 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4* | |
108 0 1 2 3 4 | |
109 gbrg|brgb|rgbr|gbrg | |
110 1111 1111 0010 0100 | |
111 89AB CDEF 0182 3945 | |
112 | |
113 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5* | |
114 0 1 2 3 4 | |
115 brgb|rgbr|gbrg|brgb | |
116 1001 0010 0100 1001 | |
117 a67b 89cA BdCD eEFf | |
118 | |
119 */ | |
120 static | |
121 const vector unsigned char | |
13564 | 122 perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05, |
12698 | 123 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a), |
13564 | 124 perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17, |
12698 | 125 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f), |
13564 | 126 perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, |
12698 | 127 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05), |
13564 | 128 perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a, |
12698 | 129 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f); |
130 | |
/* Interleave three 16-byte planes into 48 bytes of packed 3-byte pixels,
   returned in y0,y1,y2 (in that order).
   Mind the parameter order: each output pixel is laid out as
   x0,x1,x2, i.e. the THIRD macro argument supplies the first byte of
   every pixel and the FIRST macro argument supplies the last one. */
#define vec_merge3(x2,x1,x0,y0,y1,y2)			\
do {							\
  typeof(x0) m3_hi, m3_carry, m3_lo;			\
  m3_hi    = vec_mergeh (x0,x1);			\
  y0       = vec_perm (m3_hi, x2, perm_rgb_0);		\
  m3_carry = vec_perm (m3_hi, x2, perm_rgb_1);		\
  m3_lo    = vec_mergel (x0,x1);			\
  y1       = vec_perm (m3_lo, m3_carry, perm_rgb_2);	\
  y2       = vec_perm (m3_lo, m3_carry, perm_rgb_3);	\
} while(0)
141 | |
/* Interleave three 16-byte planes with vec_merge3 and store the resulting
   48 bytes at ptr, advancing ptr by three vectors.
   The arguments are forwarded to vec_merge3 unchanged, so (given
   vec_merge3's reversed parameter naming) each stored pixel is laid out
   as x2,x1,x0.
   No trailing ';' after while (0): the caller supplies it, which keeps
   the macro usable as a single statement in if/else. */
#define vec_mstrgb24(x0,x1,x2,ptr)        \
do {					  \
  typeof(x0) _0,_1,_2;			  \
  vec_merge3 (x0,x1,x2,_0,_1,_2);	  \
  vec_st (_0, 0, ptr++);		  \
  vec_st (_1, 0, ptr++);		  \
  vec_st (_2, 0, ptr++);		  \
} while (0)
150 | |
/* Same as vec_mstrgb24 but forwards the planes to vec_merge3 in reverse
   order, so each stored pixel is laid out as x0,x1,x2.  Stores 48 bytes
   at ptr and advances ptr by three vectors.
   No trailing ';' after while (0): the caller supplies it, which keeps
   the macro usable as a single statement in if/else. */
#define vec_mstbgr24(x0,x1,x2,ptr)        \
do {					  \
  typeof(x0) _0,_1,_2;			  \
  vec_merge3 (x2,x1,x0,_0,_1,_2);	  \
  vec_st (_0, 0, ptr++);		  \
  vec_st (_1, 0, ptr++);		  \
  vec_st (_2, 0, ptr++);		  \
} while (0)
159 | |
/* Pack four 16-byte planes into 16 four-byte pixels and store them.
   Each pixel is written as the byte sequence x0,x1,x2,x3 (x0 at the
   lowest address), e.g. "rgb0" when called with (R,G,B,zero).
   T is the vector type of the planes.  64 bytes are stored at ptr and
   ptr is advanced by four vectors.
   No trailing ';' after while (0): the caller supplies it, which keeps
   the macro usable as a single statement in if/else. */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)						       \
do {										       \
  T _0,_1,_2,_3;								       \
  /* pixels 0..7: byte pairs (x0,x1) and (x2,x3), then merged as 16-bit units */      \
  _0 = vec_mergeh (x0,x1);							       \
  _1 = vec_mergeh (x2,x3);							       \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
  vec_st (_2, 0*16, (T *)ptr);						               \
  vec_st (_3, 1*16, (T *)ptr);						       	       \
  /* pixels 8..15 */								       \
  _0 = vec_mergel (x0,x1);							       \
  _1 = vec_mergel (x2,x3);						       	       \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); 	       \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); 	       \
  vec_st (_2, 2*16, (T *)ptr);						       	       \
  vec_st (_3, 3*16, (T *)ptr);						       	       \
  ptr += 4;									       \
} while (0)
181 | |
182 /* | |
183 | |
184 | 1 0 1.4021 | | Y | | |
185 | 1 -0.3441 -0.7142 |x| Cb| | |
186 | 1 1.7718 0 | | Cr| | |
187 | |
188 | |
189 Y: [-128 127] | |
190 Cb/Cr : [-128 127] | |
191 | |
192 typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode. | |
193 | |
194 */ | |
195 | |
196 | |
197 | |
198 | |
/* Widen bytes 0..7 of x to eight 16-bit lanes: every lane gets a zero
   byte (selector 0x10 = first byte of the all-zero second operand)
   followed by one source byte. */
#define vec_unh(x) \
  (vector signed short) \
    vec_perm(x,(typeof(x))AVV(0),\
             (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                       0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))

/* Same as vec_unh for bytes 8..15 of x. */
#define vec_unl(x) \
  (vector signed short) \
    vec_perm(x,(typeof(x))AVV(0),\
             (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                       0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))

/* Clamp every lane of x to the range [16,235]. */
#define vec_clip(x) \
  vec_max (vec_min (x, (typeof(x))AVV(235)), (typeof(x))AVV(16))

/* Clamp x and y with vec_clip, then pack both into one byte vector. */
#define vec_packclp_a(x,y) \
  (vector unsigned char)vec_pack (vec_clip (x), vec_clip (y))

/* Raise negative lanes of x and y to 0, then pack both into one byte
   vector with vec_packs on the unsigned-short view of the lanes. */
#define vec_packclp(x,y) \
  (vector unsigned char)vec_packs \
      ((vector unsigned short)vec_max (x,(vector signed short) AVV(0)), \
       (vector unsigned short)vec_max (y,(vector signed short) AVV(0)))
12698 | 220 |
13564 | 221 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr) |
12698 | 222 |
223 | |
12836 | 224 static inline void cvtyuvtoRGB (SwsContext *c, |
12698 | 225 vector signed short Y, vector signed short U, vector signed short V, |
226 vector signed short *R, vector signed short *G, vector signed short *B) | |
227 { | |
228 vector signed short vx,ux,uvx; | |
229 | |
230 Y = vec_mradds (Y, c->CY, c->OY); | |
13564 | 231 U = vec_sub (U,(vector signed short) |
232 vec_splat((vector signed short)AVV(128),0)); | |
233 V = vec_sub (V,(vector signed short) | |
234 vec_splat((vector signed short)AVV(128),0)); | |
12698 | 235 |
236 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15; | |
237 ux = vec_sl (U, c->CSHIFT); | |
238 *B = vec_mradds (ux, c->CBU, Y); | |
239 | |
240 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15; | |
241 vx = vec_sl (V, c->CSHIFT); | |
242 *R = vec_mradds (vx, c->CRV, Y); | |
243 | |
244 // uvx = ((CGU*u) + (CGV*v))>>15; | |
245 uvx = vec_mradds (U, c->CGU, Y); | |
246 *G = vec_mradds (V, c->CGV, uvx); | |
247 } | |
248 | |
249 | |
250 /* | |
251 ------------------------------------------------------------------------------ | |
252 CS converters | |
253 ------------------------------------------------------------------------------ | |
254 */ | |
255 | |
256 | |
/*
  DEFCSP420_CVT(name, out_pixels) defines

      static int altivec_<name> (SwsContext *c,
                                 unsigned char **in, int *instrides,
                                 int srcSliceY, int srcSliceH,
                                 unsigned char **oplanes, int *outstrides)

  which converts a slice of planar 4:2:0 video to packed RGB.

    in[0..2], instrides[0..2]  Y, U and V planes of the slice and their
                               line strides in bytes; in[] is indexed
                               from the first line of the slice.
    srcSliceY, srcSliceH       first output line and number of lines.
    oplanes[0], outstrides[0]  destination buffer and its line stride.
    out_pixels(R,G,B,ptr)      macro that stores 16 pixels at ptr and
                               advances ptr.
  Returns srcSliceH.

  Two luma lines sharing one chroma line are processed per outer
  iteration, 16 pixels per inner iteration (c->srcW / 16 blocks; any
  remainder is not converted).  The luma lines are fetched with aligned
  vector loads; chroma is fetched with the lvsl/vec_perm unaligned idiom.

  Every line pointer is re-derived from its plane stride at the start of
  each line pair.  (Previously the luma and output pointers were advanced
  by "bytes consumed + stride", which is only correct when the stride
  equals the packed line width.)
*/
#define DEFCSP420_CVT(name,out_pixels)                                     \
static int altivec_##name (SwsContext *c,                                  \
				unsigned char **in, int *instrides,	   \
				int srcSliceY,	int srcSliceH,		   \
				unsigned char **oplanes, int *outstrides)  \
{									   \
  int w = c->srcW;							   \
  int h = srcSliceH;							   \
  int i,j;								   \
  vector unsigned char y0,y1;						   \
									   \
  vector signed char  u,v;						   \
									   \
  vector signed short Y0,Y1,Y2,Y3;					   \
  vector signed short U,V;						   \
  vector signed short vx,ux,uvx;					   \
  vector signed short vx0,ux0,uvx0;					   \
  vector signed short vx1,ux1,uvx1;					   \
  vector signed short R0,G0,B0;						   \
  vector signed short R1,G1,B1;						   \
  vector unsigned char R,G,B;						   \
									   \
  vector unsigned char *uivP, *vivP;			   		   \
  vector unsigned char align_perm;					   \
									   \
  /* local copies of the conversion coefficients */			   \
  vector signed short 							   \
    lCY  = c->CY,							   \
    lOY  = c->OY,							   \
    lCRV = c->CRV,							   \
    lCBU = c->CBU,							   \
    lCGU = c->CGU,							   \
    lCGV = c->CGV;							   \
									   \
  vector unsigned short lCSHIFT = c->CSHIFT;				   \
									   \
  ubyte *y1i, *y2i, *ui, *vi;						   \
  ubyte *dst = oplanes[0]+srcSliceY*outstrides[0];			   \
  vector unsigned char *oute, *outo;					   \
									   \
  for (i=0;i<h/2;i++) {							   \
    /* even/odd luma line, shared chroma line, even/odd output line */	   \
    y1i  = in[0] + (2*i)*instrides[0];					   \
    y2i  = y1i + instrides[0];						   \
    ui   = in[1] + i*instrides[1];					   \
    vi   = in[2] + i*instrides[2];					   \
    oute = (vector unsigned char *)(dst + (2*i)*outstrides[0]);		   \
    outo = (vector unsigned char *)(dst + (2*i+1)*outstrides[0]);	   \
									   \
    /* data-stream hints for the two output lines about to be written */   \
    vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 \
    vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 \
									   \
    for (j=0;j<w/16;j++) {						   \
									   \
      y0 = vec_ldl (0,y1i);						   \
      y1 = vec_ldl (0,y2i);						   \
      uivP = (vector unsigned char *)ui;				   \
      vivP = (vector unsigned char *)vi;				   \
									   \
      align_perm = vec_lvsl (0, ui);					   \
      u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);	   \
									   \
      align_perm = vec_lvsl (0, vi);					   \
      v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);	   \
									   \
      /* remove the chroma bias */					   \
      u  = (vector signed char)						   \
     		vec_sub (u,(vector signed char)                            \
				vec_splat((vector signed char)AVV(128),0));\
      v  = (vector signed char)						   \
     		vec_sub (v,(vector signed char)				   \
				vec_splat((vector signed char)AVV(128),0));\
									   \
      /* only the first 8 chroma samples are used per 16 pixels */	   \
      U  = vec_unpackh (u);						   \
      V  = vec_unpackh (v);						   \
									   \
        Y0 = vec_unh (y0);						   \
        Y1 = vec_unl (y0);						   \
        Y2 = vec_unh (y1);						   \
        Y3 = vec_unl (y1);						   \
									   \
        Y0 = vec_mradds (Y0, lCY, lOY);					   \
        Y1 = vec_mradds (Y1, lCY, lOY);					   \
        Y2 = vec_mradds (Y2, lCY, lOY);					   \
        Y3 = vec_mradds (Y3, lCY, lOY);					   \
									   \
	/*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */			   \
	ux = vec_sl (U, lCSHIFT);					   \
	ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));	   \
	ux0  = vec_mergeh (ux,ux);					   \
	ux1  = vec_mergel (ux,ux);					   \
									   \
	/* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;	*/			   \
	vx = vec_sl (V, lCSHIFT);					   \
	vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));	   \
	vx0  = vec_mergeh (vx,vx);					   \
	vx1  = vec_mergel (vx,vx);					   \
									   \
	/* uvx = ((CGU*u) + (CGV*v))>>15 */				   \
	uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));	   \
	uvx = vec_mradds (V, lCGV, uvx);				   \
	uvx0 = vec_mergeh (uvx,uvx);					   \
	uvx1 = vec_mergel (uvx,uvx);					   \
									   \
	/* even line */							   \
	R0 = vec_add (Y0,vx0);						   \
	G0 = vec_add (Y0,uvx0);						   \
	B0 = vec_add (Y0,ux0);						   \
	R1 = vec_add (Y1,vx1);						   \
	G1 = vec_add (Y1,uvx1);						   \
	B1 = vec_add (Y1,ux1);						   \
									   \
	R  = vec_packclp (R0,R1);					   \
	G  = vec_packclp (G0,G1);					   \
	B  = vec_packclp (B0,B1);					   \
									   \
	out_pixels(R,G,B,oute);						   \
									   \
	/* odd line, same chroma terms */				   \
	R0 = vec_add (Y2,vx0);						   \
	G0 = vec_add (Y2,uvx0);						   \
	B0 = vec_add (Y2,ux0);						   \
	R1 = vec_add (Y3,vx1);						   \
	G1 = vec_add (Y3,uvx1);						   \
	B1 = vec_add (Y3,ux1);						   \
	R  = vec_packclp (R0,R1);					   \
	G  = vec_packclp (G0,G1);					   \
	B  = vec_packclp (B0,B1);					   \
									   \
	out_pixels(R,G,B,outo);						   \
									   \
      y1i  += 16;							   \
      y2i  += 16;							   \
      ui   += 8;							   \
      vi   += 8;							   \
									   \
    }									   \
  }									   \
  return srcSliceH;							   \
}
410 | |
411 | |
/* Pixel writers usable as the out_pixels argument of DEFCSP420_CVT.
   a,b,c are the R,G,B byte planes; ptr is the output vector pointer,
   which is advanced past the pixels written.
   For the 32-bit writers the name spells the byte sequence of each
   stored pixel, with an all-zero plane in the 'a' (alpha) position.
   NOTE(review): out_bgr24 swaps its arguments and vec_mstbgr24 swaps
   them again, so out_rgb24 and out_bgr24 both end up as
   vec_merge3(a,b,c,...) - confirm that identical output is intended. */
#define out_abgr(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),c,b,a,ptr)
#define out_bgra(a,b,c,ptr)  vec_mstrgb32(typeof(a),c,b,a,((typeof (a))AVV(0)),ptr)
#define out_rgba(a,b,c,ptr)  vec_mstrgb32(typeof(a),a,b,c,((typeof (a))AVV(0)),ptr)
#define out_argb(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,b,c,ptr)
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) vec_mstbgr24(c,b,a,ptr)
12698 | 418 |
419 DEFCSP420_CVT (yuv2_abgr32, out_abgr) | |
13564 | 420 #if 1 |
12698 | 421 DEFCSP420_CVT (yuv2_bgra32, out_argb) |
13564 | 422 #else |
423 static int altivec_yuv2_bgra32 (SwsContext *c, | |
424 unsigned char **in, int *instrides, | |
425 int srcSliceY, int srcSliceH, | |
426 unsigned char **oplanes, int *outstrides) | |
427 { | |
428 int w = c->srcW; | |
429 int h = srcSliceH; | |
430 int i,j; | |
431 int instrides_scl[3]; | |
432 vector unsigned char y0,y1; | |
433 | |
434 vector signed char u,v; | |
435 | |
436 vector signed short Y0,Y1,Y2,Y3; | |
437 vector signed short U,V; | |
438 vector signed short vx,ux,uvx; | |
439 vector signed short vx0,ux0,uvx0; | |
440 vector signed short vx1,ux1,uvx1; | |
441 vector signed short R0,G0,B0; | |
442 vector signed short R1,G1,B1; | |
443 vector unsigned char R,G,B; | |
444 | |
445 vector unsigned char *uivP, *vivP; | |
446 vector unsigned char align_perm; | |
447 | |
448 vector signed short | |
449 lCY = c->CY, | |
450 lOY = c->OY, | |
451 lCRV = c->CRV, | |
452 lCBU = c->CBU, | |
453 lCGU = c->CGU, | |
454 lCGV = c->CGV; | |
455 | |
456 vector unsigned short lCSHIFT = c->CSHIFT; | |
457 | |
458 ubyte *y1i = in[0]; | |
459 ubyte *y2i = in[0]+w; | |
460 ubyte *ui = in[1]; | |
461 ubyte *vi = in[2]; | |
462 | |
463 vector unsigned char *oute | |
464 = (vector unsigned char *) | |
465 (oplanes[0]+srcSliceY*outstrides[0]); | |
466 vector unsigned char *outo | |
467 = (vector unsigned char *) | |
468 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); | |
469 | |
470 | |
471 instrides_scl[0] = instrides[0]; | |
472 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ | |
473 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ | |
474 | |
475 | |
476 for (i=0;i<h/2;i++) { | |
477 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); | |
478 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); | |
479 | |
480 for (j=0;j<w/16;j++) { | |
481 | |
482 y0 = vec_ldl (0,y1i); | |
483 y1 = vec_ldl (0,y2i); | |
484 uivP = (vector unsigned char *)ui; | |
485 vivP = (vector unsigned char *)vi; | |
486 | |
487 align_perm = vec_lvsl (0, ui); | |
488 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); | |
489 | |
490 align_perm = vec_lvsl (0, vi); | |
491 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); | |
492 u = (vector signed char) | |
493 vec_sub (u,(vector signed char) | |
494 vec_splat((vector signed char)AVV(128),0)); | |
495 | |
496 v = (vector signed char) | |
497 vec_sub (v, (vector signed char) | |
498 vec_splat((vector signed char)AVV(128),0)); | |
499 | |
500 U = vec_unpackh (u); | |
501 V = vec_unpackh (v); | |
502 | |
503 | |
504 Y0 = vec_unh (y0); | |
505 Y1 = vec_unl (y0); | |
506 Y2 = vec_unh (y1); | |
507 Y3 = vec_unl (y1); | |
508 | |
509 Y0 = vec_mradds (Y0, lCY, lOY); | |
510 Y1 = vec_mradds (Y1, lCY, lOY); | |
511 Y2 = vec_mradds (Y2, lCY, lOY); | |
512 Y3 = vec_mradds (Y3, lCY, lOY); | |
513 | |
514 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ | |
515 ux = vec_sl (U, lCSHIFT); | |
516 ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); | |
517 ux0 = vec_mergeh (ux,ux); | |
518 ux1 = vec_mergel (ux,ux); | |
519 | |
520 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ | |
521 vx = vec_sl (V, lCSHIFT); | |
522 vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); | |
523 vx0 = vec_mergeh (vx,vx); | |
524 vx1 = vec_mergel (vx,vx); | |
525 /* uvx = ((CGU*u) + (CGV*v))>>15 */ | |
526 uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); | |
527 uvx = vec_mradds (V, lCGV, uvx); | |
528 uvx0 = vec_mergeh (uvx,uvx); | |
529 uvx1 = vec_mergel (uvx,uvx); | |
530 R0 = vec_add (Y0,vx0); | |
531 G0 = vec_add (Y0,uvx0); | |
532 B0 = vec_add (Y0,ux0); | |
533 R1 = vec_add (Y1,vx1); | |
534 G1 = vec_add (Y1,uvx1); | |
535 B1 = vec_add (Y1,ux1); | |
536 R = vec_packclp (R0,R1); | |
537 G = vec_packclp (G0,G1); | |
538 B = vec_packclp (B0,B1); | |
539 | |
540 out_argb(R,G,B,oute); | |
541 R0 = vec_add (Y2,vx0); | |
542 G0 = vec_add (Y2,uvx0); | |
543 B0 = vec_add (Y2,ux0); | |
544 R1 = vec_add (Y3,vx1); | |
545 G1 = vec_add (Y3,uvx1); | |
546 B1 = vec_add (Y3,ux1); | |
547 R = vec_packclp (R0,R1); | |
548 G = vec_packclp (G0,G1); | |
549 B = vec_packclp (B0,B1); | |
550 | |
551 out_argb(R,G,B,outo); | |
552 y1i += 16; | |
553 y2i += 16; | |
554 ui += 8; | |
555 vi += 8; | |
556 | |
557 } | |
558 | |
559 outo += (outstrides[0])>>4; | |
560 oute += (outstrides[0])>>4; | |
561 | |
562 ui += instrides_scl[1]; | |
563 vi += instrides_scl[2]; | |
564 y1i += instrides_scl[0]; | |
565 y2i += instrides_scl[0]; | |
566 } | |
567 return srcSliceH; | |
568 } | |
569 | |
570 #endif | |
571 | |
572 | |
12698 | 573 DEFCSP420_CVT (yuv2_rgba32, out_rgba) |
574 DEFCSP420_CVT (yuv2_argb32, out_argb) | |
575 DEFCSP420_CVT (yuv2_rgb24, out_rgb24) | |
576 DEFCSP420_CVT (yuv2_bgr24, out_bgr24) | |
577 | |
578 | |
579 // uyvy|uyvy|uyvy|uyvy | |
580 // 0123 4567 89ab cdef | |
581 static | |
582 const vector unsigned char | |
13564 | 583 demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00, |
12698 | 584 0x10,0x04,0x10,0x04, |
585 0x10,0x08,0x10,0x08, | |
586 0x10,0x0c,0x10,0x0c), | |
13564 | 587 demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02, |
12698 | 588 0x10,0x06,0x10,0x06, |
589 0x10,0x0A,0x10,0x0A, | |
590 0x10,0x0E,0x10,0x0E), | |
13564 | 591 demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03, |
12698 | 592 0x10,0x05,0x10,0x07, |
593 0x10,0x09,0x10,0x0B, | |
594 0x10,0x0D,0x10,0x0F); | |
595 | |
596 /* | |
597 this is so I can play live CCIR raw video | |
598 */ | |
599 static int altivec_uyvy_rgb32 (SwsContext *c, | |
600 unsigned char **in, int *instrides, | |
601 int srcSliceY, int srcSliceH, | |
602 unsigned char **oplanes, int *outstrides) | |
603 { | |
604 int w = c->srcW; | |
605 int h = srcSliceH; | |
606 int i,j; | |
607 vector unsigned char uyvy; | |
608 vector signed short Y,U,V; | |
609 vector signed short vx,ux,uvx; | |
610 vector signed short R0,G0,B0,R1,G1,B1; | |
611 vector unsigned char R,G,B; | |
612 vector unsigned char *out; | |
613 ubyte *img; | |
614 | |
615 img = in[0]; | |
616 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]); | |
617 | |
618 for (i=0;i<h;i++) { | |
619 for (j=0;j<w/16;j++) { | |
620 uyvy = vec_ld (0, img); | |
621 U = (vector signed short) | |
13564 | 622 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u); |
12698 | 623 |
624 V = (vector signed short) | |
13564 | 625 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v); |
12698 | 626 |
627 Y = (vector signed short) | |
13564 | 628 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y); |
12698 | 629 |
630 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0); | |
631 | |
632 uyvy = vec_ld (16, img); | |
633 U = (vector signed short) | |
13564 | 634 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u); |
12698 | 635 |
636 V = (vector signed short) | |
13564 | 637 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v); |
12698 | 638 |
639 Y = (vector signed short) | |
13564 | 640 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y); |
12698 | 641 |
642 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1); | |
643 | |
644 R = vec_packclp (R0,R1); | |
645 G = vec_packclp (G0,G1); | |
646 B = vec_packclp (B0,B1); | |
647 | |
648 // vec_mstbgr24 (R,G,B, out); | |
649 out_rgba (R,G,B,out); | |
650 | |
651 img += 32; | |
652 } | |
653 } | |
12836 | 654 return srcSliceH; |
12698 | 655 } |
656 | |
657 | |
658 | |
659 /* Ok currently the acceleration routine only supports | |
660 inputs of widths a multiple of 16 | |
661 and heights a multiple 2 | |
662 | |
663 So we just fall back to the C codes for this. | |
664 */ | |
665 SwsFunc yuv2rgb_init_altivec (SwsContext *c) | |
666 { | |
667 if (!(c->flags & SWS_CPU_CAPS_ALTIVEC)) | |
668 return NULL; | |
669 | |
670 /* | |
671 and this seems not to matter too much I tried a bunch of | |
672 videos with abnormal widths and mplayer crashes else where. | |
673 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv | |
674 boom with X11 bad match. | |
675 | |
676 */ | |
677 if ((c->srcW & 0xf) != 0) return NULL; | |
678 | |
679 switch (c->srcFormat) { | |
680 case IMGFMT_YVU9: | |
681 case IMGFMT_IF09: | |
682 case IMGFMT_YV12: | |
683 case IMGFMT_I420: | |
684 case IMGFMT_IYUV: | |
685 case IMGFMT_CLPL: | |
686 case IMGFMT_Y800: | |
687 case IMGFMT_Y8: | |
688 case IMGFMT_NV12: | |
689 case IMGFMT_NV21: | |
690 if ((c->srcH & 0x1) != 0) | |
691 return NULL; | |
692 | |
693 switch(c->dstFormat){ | |
694 case IMGFMT_RGB24: | |
695 MSG_WARN("ALTIVEC: Color Space RGB24\n"); | |
696 return altivec_yuv2_rgb24; | |
697 case IMGFMT_BGR24: | |
698 MSG_WARN("ALTIVEC: Color Space BGR24\n"); | |
699 return altivec_yuv2_bgr24; | |
700 case IMGFMT_RGB32: | |
701 MSG_WARN("ALTIVEC: Color Space ARGB32\n"); | |
702 return altivec_yuv2_argb32; | |
703 case IMGFMT_BGR32: | |
704 MSG_WARN("ALTIVEC: Color Space BGRA32\n"); | |
705 // return profile_altivec_bgra32; | |
706 | |
707 return altivec_yuv2_bgra32; | |
708 default: return NULL; | |
709 } | |
710 break; | |
711 | |
712 case IMGFMT_UYVY: | |
713 switch(c->dstFormat){ | |
714 case IMGFMT_RGB32: | |
715 MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n"); | |
716 return altivec_uyvy_rgb32; | |
717 default: return NULL; | |
718 } | |
719 break; | |
720 | |
721 } | |
722 return NULL; | |
723 } | |
724 | |
/* Convert a 16.16 fixed-point value to a rounded, saturated 16-bit
   integer: returns (f + 0x8000) >> 16, limited to [-0x7FFF, 0x7FFF].
   The result is returned as the uint16_t bit pattern of that signed
   value; results below -0x7FFF yield 0x8000.
   The intermediate is kept in 64 bits so that large inputs saturate
   (narrowing to int before the range check made them wrap around). */
static uint16_t roundToInt16(int64_t f){
	int64_t r;

	/* f + 0x8000 would overflow; such values saturate high anyway */
	if(f > INT64_MAX - (1<<15)) return 0x7FFF;

	r= (f + (1<<15))>>16;
	     if(r<-0x7FFF) return 0x8000;
	else if(r> 0x7FFF) return 0x7FFF;
	else               return (uint16_t)r;
}
12698 | 731 |
13564 | 732 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation) |
733 { | |
734 union { | |
735 signed short tmp[8] __attribute__ ((aligned(16))); | |
736 vector signed short vec; | |
737 } buf; | |
12698 | 738 |
13564 | 739 buf.tmp[0] = ( (0xffffLL) * contrast>>8 )>>9; //cy |
740 buf.tmp[1] = -256*brightness; //oy | |
741 buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv | |
742 buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu | |
743 buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu | |
744 buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv | |
12698 | 745 |
746 | |
13564 | 747 c->CSHIFT = (vector unsigned short)vec_splat((vector unsigned short)AVV(2),0); |
748 c->CY = vec_splat ((vector signed short)buf.vec, 0); | |
749 c->OY = vec_splat ((vector signed short)buf.vec, 1); | |
750 c->CRV = vec_splat ((vector signed short)buf.vec, 2); | |
751 c->CBU = vec_splat ((vector signed short)buf.vec, 3); | |
752 c->CGU = vec_splat ((vector signed short)buf.vec, 4); | |
753 c->CGV = vec_splat ((vector signed short)buf.vec, 5); | |
12836 | 754 #if 0 |
13564 | 755 { |
756 int i; | |
757 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"}; | |
758 for (i=0; i<6;i++) | |
759 printf("%s %d ", v[i],buf.tmp[i] ); | |
760 printf("\n"); | |
761 } | |
12698 | 762 #endif |
12836 | 763 return; |
12698 | 764 } |
765 | |
766 | |
767 void | |
768 altivec_yuv2packedX (SwsContext *c, | |
769 int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | |
770 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
771 uint8_t *dest, int dstW, int dstY) | |
772 { | |
773 int i,j; | |
774 short tmp __attribute__((aligned (16))); | |
13564 | 775 int16_t *p; |
12698 | 776 short *f; |
777 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V; | |
778 vector signed short R0,G0,B0,R1,G1,B1; | |
779 | |
780 vector unsigned char R,G,B,pels[3]; | |
781 vector unsigned char *out,*nout; | |
13564 | 782 |
783 vector signed short RND = vec_splat((vector signed short)AVV(1<<3),0); | |
784 vector unsigned short SCL = vec_splat((vector unsigned short)AVV(4),0); | |
12698 | 785 unsigned long scratch[16] __attribute__ ((aligned (16))); |
786 | |
787 vector signed short *vYCoeffsBank, *vCCoeffsBank; | |
788 | |
789 vector signed short *YCoeffs, *CCoeffs; | |
790 | |
17557
3f863d1d8b43
vYCoeffsBank and vCCoeffsBank are allocated and initialized using incorrect
diego
parents:
16985
diff
changeset
|
791 vYCoeffsBank = malloc (sizeof (vector signed short)*lumFilterSize*c->dstH); |
3f863d1d8b43
vYCoeffsBank and vCCoeffsBank are allocated and initialized using incorrect
diego
parents:
16985
diff
changeset
|
792 vCCoeffsBank = malloc (sizeof (vector signed short)*chrFilterSize*c->dstH); |
12698 | 793 |
17557
3f863d1d8b43
vYCoeffsBank and vCCoeffsBank are allocated and initialized using incorrect
diego
parents:
16985
diff
changeset
|
794 for (i=0;i<lumFilterSize*c->dstH;i++) { |
12698 | 795 tmp = c->vLumFilter[i]; |
796 p = &vYCoeffsBank[i]; | |
797 for (j=0;j<8;j++) | |
798 p[j] = tmp; | |
799 } | |
800 | |
17557
3f863d1d8b43
vYCoeffsBank and vCCoeffsBank are allocated and initialized using incorrect
diego
parents:
16985
diff
changeset
|
801 for (i=0;i<chrFilterSize*c->dstH;i++) { |
12698 | 802 tmp = c->vChrFilter[i]; |
803 p = &vCCoeffsBank[i]; | |
804 for (j=0;j<8;j++) | |
805 p[j] = tmp; | |
806 } | |
807 | |
808 YCoeffs = vYCoeffsBank+dstY*lumFilterSize; | |
809 CCoeffs = vCCoeffsBank+dstY*chrFilterSize; | |
810 | |
811 out = (vector unsigned char *)dest; | |
812 | |
813 for(i=0; i<dstW; i+=16){ | |
814 Y0 = RND; | |
815 Y1 = RND; | |
816 /* extract 16 coeffs from lumSrc */ | |
817 for(j=0; j<lumFilterSize; j++) { | |
818 X0 = vec_ld (0, &lumSrc[j][i]); | |
819 X1 = vec_ld (16, &lumSrc[j][i]); | |
820 Y0 = vec_mradds (X0, YCoeffs[j], Y0); | |
821 Y1 = vec_mradds (X1, YCoeffs[j], Y1); | |
822 } | |
823 | |
824 U = RND; | |
825 V = RND; | |
826 /* extract 8 coeffs from U,V */ | |
827 for(j=0; j<chrFilterSize; j++) { | |
828 X = vec_ld (0, &chrSrc[j][i/2]); | |
829 U = vec_mradds (X, CCoeffs[j], U); | |
830 X = vec_ld (0, &chrSrc[j][i/2+2048]); | |
831 V = vec_mradds (X, CCoeffs[j], V); | |
832 } | |
833 | |
834 /* scale and clip signals */ | |
835 Y0 = vec_sra (Y0, SCL); | |
836 Y1 = vec_sra (Y1, SCL); | |
837 U = vec_sra (U, SCL); | |
838 V = vec_sra (V, SCL); | |
839 | |
840 Y0 = vec_clip (Y0); | |
841 Y1 = vec_clip (Y1); | |
842 U = vec_clip (U); | |
843 V = vec_clip (V); | |
844 | |
845 /* now we have | |
846 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 | |
847 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7 | |
848 | |
849 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 | |
850 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7 | |
851 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7 | |
852 */ | |
853 | |
854 U0 = vec_mergeh (U,U); | |
855 V0 = vec_mergeh (V,V); | |
856 | |
857 U1 = vec_mergel (U,U); | |
858 V1 = vec_mergel (V,V); | |
859 | |
860 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); | |
861 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); | |
862 | |
863 R = vec_packclp (R0,R1); | |
864 G = vec_packclp (G0,G1); | |
865 B = vec_packclp (B0,B1); | |
866 | |
867 out_rgba (R,G,B,out); | |
868 } | |
869 | |
870 if (i < dstW) { | |
871 i -= 16; | |
872 | |
873 Y0 = RND; | |
874 Y1 = RND; | |
875 /* extract 16 coeffs from lumSrc */ | |
876 for(j=0; j<lumFilterSize; j++) { | |
877 X0 = vec_ld (0, &lumSrc[j][i]); | |
878 X1 = vec_ld (16, &lumSrc[j][i]); | |
879 Y0 = vec_mradds (X0, YCoeffs[j], Y0); | |
880 Y1 = vec_mradds (X1, YCoeffs[j], Y1); | |
881 } | |
882 | |
883 U = RND; | |
884 V = RND; | |
885 /* extract 8 coeffs from U,V */ | |
886 for(j=0; j<chrFilterSize; j++) { | |
887 X = vec_ld (0, &chrSrc[j][i/2]); | |
888 U = vec_mradds (X, CCoeffs[j], U); | |
889 X = vec_ld (0, &chrSrc[j][i/2+2048]); | |
890 V = vec_mradds (X, CCoeffs[j], V); | |
891 } | |
892 | |
893 /* scale and clip signals */ | |
894 Y0 = vec_sra (Y0, SCL); | |
895 Y1 = vec_sra (Y1, SCL); | |
896 U = vec_sra (U, SCL); | |
897 V = vec_sra (V, SCL); | |
898 | |
899 Y0 = vec_clip (Y0); | |
900 Y1 = vec_clip (Y1); | |
901 U = vec_clip (U); | |
902 V = vec_clip (V); | |
903 | |
904 /* now we have | |
905 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 | |
906 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7 | |
907 | |
908 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 | |
909 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7 | |
910 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7 | |
911 */ | |
912 | |
913 U0 = vec_mergeh (U,U); | |
914 V0 = vec_mergeh (V,V); | |
915 | |
916 U1 = vec_mergel (U,U); | |
917 V1 = vec_mergel (V,V); | |
918 | |
919 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); | |
920 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); | |
921 | |
922 R = vec_packclp (R0,R1); | |
923 G = vec_packclp (G0,G1); | |
924 B = vec_packclp (B0,B1); | |
925 | |
926 nout = (vector unsigned char *)scratch; | |
927 out_rgba (R,G,B,nout); | |
928 | |
929 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4); | |
930 } | |
931 | |
932 if (vYCoeffsBank) free (vYCoeffsBank); | |
933 if (vCCoeffsBank) free (vCCoeffsBank); | |
934 | |
935 } |