comparison postproc/yuv2rgb_altivec.c @ 12698:d2aef091743c

altivec yuv->rgb converter original patch by (Marc Hoffman <mmh at pleasantst dot com>) critical fixes by (Reza Jelveh <reza.jelveh at tu-harburg dot de>) known bugs/issues, which should be fixed ASAP by someone who has a ppc: 0..255 vs. 16..235 unneeded recalculation of tables general cleanup, like removing double initializing of variables
author michael
date Sun, 27 Jun 2004 00:07:15 +0000
parents
children 9a310b31359f
comparison
equal deleted inserted replaced
12697:86ca4e017ac8 12698:d2aef091743c
1 /*
2 marc.hoffman@analog.com March 8, 2004
3
4 Altivec Acceleration for Color Space Conversion revision 0.2
5
6 convert I420 YV12 to RGB in various formats,
7 it rejects images that are not in 420 formats
8 it rejects images that don't have widths of multiples of 16
9 it rejects images that don't have heights of multiples of 2
10 reject defers to C simulation codes.
11
12 lots of optimizations to be done here
13
14 1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15 so we currently use max min to clip
16
17 2. the inefficient use of chroma loading needs a bit of brushing up
18
19 3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
20
21
22 MODIFIED to calculate coeffs from currently selected color space.
23 MODIFIED core to be a macro which you spec the output format.
24 ADDED UYVY conversion which is never called due to some thing in SWSCALE.
25 CORRECTED algorithm selection to be strict on input formats.
26 ADDED runtime detection of altivec.
27
28 ADDED altivec_yuv2packedX vertical scl + RGB converter
29
30 March 27,2004
31 PERFORMANCE ANALYSIS
32
33 The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
34 The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
35
36 720*480*30 ~10MPS
37
38 so we have roughly 10clocks per pixel this is too high something has to be wrong.
39
40 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
41
42 OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43 guaranteed to have the input video frame it was just decompressed so
44 it probably resides in L1 caches. However we are creating the
45 output video stream this needs to use the DSTST instruction to
46 optimize for the cache. We couple this with the fact that we are
47 not going to be visiting the input buffer again so we mark it Least
48 Recently Used. This shaves 25% of the processor cycles off.
49
50 Now MEMCPY is the largest mips consumer in the system, probably due
51 to the inefficient X11 stuff.
52
53 GL libraries seem to be very slow on this machine 1.33Ghz PB running
54 Jaguar, this is not the case for my 1Ghz PB. I thought it might be
55 a versioning issues, however i have libGL.1.2.dylib for both
56 machines. ((We need to figure this out now))
57
58 GL2 libraries work now with patch for RGB32
59
60 NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
61
62 Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
63
64 */
#include <assert.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "config.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
#include "../mangle.h"
#include "../libvo/img_format.h" //FIXME try to reduce dependency of such stuff
75
76 #undef PROFILE_THE_BEAST
77 #undef INC_SCALING
78
79 typedef unsigned char ubyte;
80 typedef signed char sbyte;
81
82
83 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
84 homogeneous vector registers x0,x1,x2 are interleaved with the
85 following technique:
86
87 o0 = vec_mergeh (x0,x1);
88 o1 = vec_perm (o0, x2, perm_rgb_0);
89 o2 = vec_perm (o0, x2, perm_rgb_1);
90 o3 = vec_mergel (x0,x1);
91 o4 = vec_perm (o3,o2,perm_rgb_2);
92 o5 = vec_perm (o3,o2,perm_rgb_3);
93
94 perm_rgb_0: o0(RG).h v1(B) --> o1*
95 0 1 2 3 4
96 rgbr|gbrg|brgb|rgbr
97 0010 0100 1001 0010
98 0102 3145 2673 894A
99
100 perm_rgb_1: o0(RG).h v1(B) --> o2
101 0 1 2 3 4
102 gbrg|brgb|bbbb|bbbb
103 0100 1001 1111 1111
104 B5CD 6EF7 89AB CDEF
105
106 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
107 0 1 2 3 4
108 gbrg|brgb|rgbr|gbrg
109 1111 1111 0010 0100
110 89AB CDEF 0182 3945
111
112 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
113 0 1 2 3 4
114 brgb|rgbr|gbrg|brgb
115 1001 0010 0100 1001
116 a67b 89cA BdCD eEFf
117
118 */
119 static
120 const vector unsigned char
121 perm_rgb_0 = (vector unsigned char)(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
122 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
123 perm_rgb_1 = (vector unsigned char)(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
124 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
125 perm_rgb_2 = (vector unsigned char)(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
126 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
127 perm_rgb_3 = (vector unsigned char)(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
128 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
129
/*
 * vec_merge3(x2, x1, x0, y0, y1, y2)
 *
 * Interleaves three 16-byte planar vectors into three 16-byte packed vectors
 * y0, y1, y2 (48 bytes, three bytes per pel).  x0 and x1 are merged pairwise
 * first and x2 is then woven in using the perm_rgb_* masks.
 *
 * Note the parameter list names the inputs in the order x2, x1, x0: the FIRST
 * macro argument is the one inserted last.  y0..y2 must be assignable and
 * must not alias the inputs.
 */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                         \
do {                                                          \
    typeof(x0) m3_pair_hi, m3_carry, m3_pair_lo;              \
    m3_pair_hi = vec_mergeh (x0, x1);                         \
    y0         = vec_perm (m3_pair_hi, x2, perm_rgb_0);       \
    m3_carry   = vec_perm (m3_pair_hi, x2, perm_rgb_1);       \
    m3_pair_lo = vec_mergel (x0, x1);                         \
    y1         = vec_perm (m3_pair_lo, m3_carry, perm_rgb_2); \
    y2         = vec_perm (m3_pair_lo, m3_carry, perm_rgb_3); \
} while(0)
140
/*
 * vec_mstrgb24(x0, x1, x2, ptr)
 *
 * Interleaves three 16-byte channel vectors with vec_merge3 and stores the
 * resulting 48 bytes through ptr with three vec_st calls.  ptr is
 * post-incremented three times, so it must be a modifiable lvalue and is
 * evaluated more than once.
 *
 * The expansion deliberately has no trailing semicolon so that the macro
 * behaves like a single statement (safe inside un-braced if/else).
 */
#define vec_mstrgb24(x0,x1,x2,ptr)          \
do {                                        \
    typeof(x0) _0,_1,_2;                    \
    vec_merge3 (x0,x1,x2,_0,_1,_2);         \
    vec_st (_0, 0, ptr++);                  \
    vec_st (_1, 0, ptr++);                  \
    vec_st (_2, 0, ptr++);                  \
} while (0)
149
/*
 * vec_mstbgr24(x0, x1, x2, ptr)
 *
 * Same as vec_mstrgb24 but hands the three channel vectors to vec_merge3 in
 * reverse order (x2, x1, x0), then stores the 48 interleaved bytes through
 * ptr.  ptr is post-incremented three times and must be a modifiable lvalue.
 *
 * The expansion deliberately has no trailing semicolon so that the macro
 * behaves like a single statement (safe inside un-braced if/else).
 */
#define vec_mstbgr24(x0,x1,x2,ptr)          \
do {                                        \
    typeof(x0) _0,_1,_2;                    \
    vec_merge3 (x2,x1,x0,_0,_1,_2);         \
    vec_st (_0, 0, ptr++);                  \
    vec_st (_1, 0, ptr++);                  \
    vec_st (_2, 0, ptr++);                  \
} while (0)
158
/*
 * vec_mstrgb32(T, x0, x1, x2, x3, ptr)
 *
 * Packs four 16-byte channel vectors into 16 four-byte pels and stores the
 * 64 resulting bytes at ptr (byte offsets 0, 16, 32 and 48), then advances
 * ptr by 4 elements.  x0/x1 and x2/x3 are merged byte-wise, and the two
 * results are merged again as 16-bit units, so each pel is laid out as
 * x0, x1, x2, x3.  The original note described this as "rgb0 format,
 * msb R, lsb 0".
 *
 *   T    vector type used for the temporaries and the store pointer cast
 *   ptr  modifiable pointer lvalue; evaluated several times
 *
 * The expansion deliberately has no trailing semicolon so that the macro
 * behaves like a single statement (safe inside un-braced if/else).
 */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
do {                                                                          \
    T _0,_1,_2,_3;                                                            \
    /* first 8 pels: high halves of the inputs */                             \
    _0 = vec_mergeh (x0,x1);                                                  \
    _1 = vec_mergeh (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 0*16, (T *)ptr);                                              \
    vec_st (_3, 1*16, (T *)ptr);                                              \
    /* last 8 pels: low halves of the inputs */                               \
    _0 = vec_mergel (x0,x1);                                                  \
    _1 = vec_mergel (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 2*16, (T *)ptr);                                              \
    vec_st (_3, 3*16, (T *)ptr);                                              \
    ptr += 4;                                                                 \
} while (0)
180
181 /*
182
183 | 1 0 1.4021 | | Y |
184 | 1 -0.3441 -0.7142 |x| Cb|
185 | 1 1.7718 0 | | Cr|
186
187
188 Y: [-128 127]
189 Cb/Cr : [-128 127]
190
191 typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
192
193 */
194
195
196
197
/*
 * vec_unh(v): widen bytes 0..7 of v to eight 16-bit lanes.
 * Each lane is built from a zero byte (selector 0x10, the first byte of the
 * all-zero second vec_perm operand) followed by one source byte, and the
 * result is reinterpreted as vector signed short.
 */
#define vec_unh(v)                                                        \
  ((vector signed short)                                                  \
   vec_perm ((v), (typeof(v))(0),                                         \
             (vector unsigned char)(0x10,0x00, 0x10,0x01, 0x10,0x02, 0x10,0x03, \
                                    0x10,0x04, 0x10,0x05, 0x10,0x06, 0x10,0x07)))
/*
 * vec_unl(v): widen bytes 8..15 of v to eight 16-bit lanes.
 * Each lane is built from a zero byte (selector 0x10, the first byte of the
 * all-zero second vec_perm operand) followed by one source byte, and the
 * result is reinterpreted as vector signed short.
 */
#define vec_unl(v)                                                        \
  ((vector signed short)                                                  \
   vec_perm ((v), (typeof(v))(0),                                         \
             (vector unsigned char)(0x10,0x08, 0x10,0x09, 0x10,0x0A, 0x10,0x0B, \
                                    0x10,0x0C, 0x10,0x0D, 0x10,0x0E, 0x10,0x0F)))
208
/* vec_clip(v): clamp every lane of v to the range 0..255 (min with 255, then max with 0). */
#define vec_clip(v) \
  (vec_max (vec_min ((v), (typeof(v))(255)), (typeof(v))(0)))
211
/*
 * vec_packclp_a(hi, lo): clamp both inputs to 0..255 with vec_clip, then
 * narrow them into one byte vector with vec_pack.
 */
#define vec_packclp_a(hi,lo) \
  ((vector unsigned char) vec_pack (vec_clip (hi), vec_clip (lo)))
214
/*
 * vec_packclp(hi, lo): narrow two vectors of signed 16-bit lanes into one
 * byte vector.  Negative lanes are first raised to 0 with vec_max; the
 * upper bound is left to the saturating pack (vec_packs) on the values
 * reinterpreted as unsigned short.
 */
#define vec_packclp(hi,lo)                                                   \
  ((vector unsigned char)                                                    \
   vec_packs ((vector unsigned short) vec_max ((hi), (vector signed short)(0)), \
              (vector unsigned short) vec_max ((lo), (vector signed short)(0))))
219
220 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),a,a,a,ptr)
221
222
223 static inline cvtyuvtoRGB (SwsContext *c,
224 vector signed short Y, vector signed short U, vector signed short V,
225 vector signed short *R, vector signed short *G, vector signed short *B)
226 {
227 vector signed short vx,ux,uvx;
228
229 Y = vec_mradds (Y, c->CY, c->OY);
230
231 U = vec_sub (U,(vector signed short)(128));
232 V = vec_sub (V,(vector signed short)(128));
233
234 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
235 ux = vec_sl (U, c->CSHIFT);
236 *B = vec_mradds (ux, c->CBU, Y);
237
238 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
239 vx = vec_sl (V, c->CSHIFT);
240 *R = vec_mradds (vx, c->CRV, Y);
241
242 // uvx = ((CGU*u) + (CGV*v))>>15;
243 uvx = vec_mradds (U, c->CGU, Y);
244 *G = vec_mradds (V, c->CGV, uvx);
245 }
246
247
248 /*
249 ------------------------------------------------------------------------------
250 CS converters
251 ------------------------------------------------------------------------------
252 */
253
254
/*
 * DEFCSP420_CVT(name, out_pixels)
 *
 * Defines `static int altivec_<name>(...)`, a converter that reads planar
 * 4:2:0 input (in[0] = luma, in[1]/in[2] = the two chroma planes) and writes
 * packed pixels to oplanes[0] through the store macro `out_pixels`.
 *
 * Each outer iteration handles two luma rows that share one chroma row; each
 * inner iteration handles 16 pels of both rows.  The function returns
 * srcSliceH.
 *
 * Luma rows are addressed with instrides[0]: the second row of a pair starts
 * instrides[0] bytes after the first, and after the inner loop has advanced
 * both pointers by w the remaining 2*instrides[0]-w bytes are skipped.  (The
 * earlier code used w for both, which is only right when instrides[0] == w.)
 *
 * Assumptions visible in the code:
 *  - w is consumed in groups of 16 (w/16 iterations), h in pairs (h/2);
 *  - luma is loaded with vec_ldl, chroma through a vec_lvsl/vec_perm
 *    realignment that also touches the following 16-byte chroma vector;
 *  - NOTE(review): after each row pair the output pointers advance by one
 *    further stride on top of the pels just written, which reaches the next
 *    row pair only if a row's pel data fills the whole output stride --
 *    confirm against callers.
 */
#define DEFCSP420_CVT(name,out_pixels)                                     \
static int altivec_##name (SwsContext *c,                                  \
                           unsigned char **in, int *instrides,             \
                           int srcSliceY, int srcSliceH,                   \
                           unsigned char **oplanes, int *outstrides)       \
{                                                                          \
  int w = c->srcW;                                                         \
  int h = srcSliceH;                                                       \
  int i,j;                                                                 \
  int instrides_scl[3];                                                    \
  vector unsigned char y0,y1;                                              \
                                                                           \
  vector signed char u,v;                                                  \
                                                                           \
  vector signed short Y0,Y1,Y2,Y3;                                         \
  vector signed short U,V;                                                 \
  vector signed short vx,ux,uvx;                                           \
  vector signed short vx0,ux0,uvx0;                                        \
  vector signed short vx1,ux1,uvx1;                                        \
  vector signed short R0,G0,B0;                                            \
  vector signed short R1,G1,B1;                                            \
  vector unsigned char R,G,B;                                              \
                                                                           \
  vector unsigned char *uivP, *vivP;                                       \
  vector unsigned char align_perm;                                         \
                                                                           \
  vector signed short                                                      \
    lCY  = c->CY,                                                          \
    lOY  = c->OY,                                                          \
    lCRV = c->CRV,                                                         \
    lCBU = c->CBU,                                                         \
    lCGU = c->CGU,                                                         \
    lCGV = c->CGV;                                                         \
                                                                           \
  vector unsigned short lCSHIFT = c->CSHIFT;                               \
                                                                           \
  ubyte *y1i = in[0];                                                      \
  ubyte *y2i = in[0]+instrides[0]; /* second row of the pair */            \
  ubyte *ui  = in[1];                                                      \
  ubyte *vi  = in[2];                                                      \
                                                                           \
  vector unsigned char *oute                                               \
    = (vector unsigned char *)                                             \
        (oplanes[0]+srcSliceY*outstrides[0]);                              \
  vector unsigned char *outo                                               \
    = (vector unsigned char *)                                             \
        (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);                \
                                                                           \
                                                                           \
  instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */   \
  instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */      \
  instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */      \
                                                                           \
                                                                           \
  for (i=0;i<h/2;i++) {                                                    \
    vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 \
    vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 \
                                                                           \
    for (j=0;j<w/16;j++) {                                                 \
                                                                           \
      y0 = vec_ldl (0,y1i);                                                \
      y1 = vec_ldl (0,y2i);                                                \
      uivP = (vector unsigned char *)ui;                                   \
      vivP = (vector unsigned char *)vi;                                   \
                                                                           \
      align_perm = vec_lvsl (0, ui);                                       \
      u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);     \
                                                                           \
      align_perm = vec_lvsl (0, vi);                                       \
      v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);     \
                                                                           \
      u = (vector signed char)vec_sub (u, (vector signed char)(128));      \
      v = (vector signed char)vec_sub (v, (vector signed char)(128));      \
      U = vec_unpackh (u);                                                 \
      V = vec_unpackh (v);                                                 \
                                                                           \
                                                                           \
      Y0 = vec_unh (y0);                                                   \
      Y1 = vec_unl (y0);                                                   \
      Y2 = vec_unh (y1);                                                   \
      Y3 = vec_unl (y1);                                                   \
                                                                           \
      Y0 = vec_mradds (Y0, lCY, lOY);                                      \
      Y1 = vec_mradds (Y1, lCY, lOY);                                      \
      Y2 = vec_mradds (Y2, lCY, lOY);                                      \
      Y3 = vec_mradds (Y3, lCY, lOY);                                      \
                                                                           \
      /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */                              \
      ux = vec_sl (U, lCSHIFT);                                            \
      ux = vec_mradds (ux, lCBU, (vector signed short)(0));                \
      ux0 = vec_mergeh (ux,ux);                                            \
      ux1 = vec_mergel (ux,ux);                                            \
                                                                           \
      /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */                             \
      vx = vec_sl (V, lCSHIFT);                                            \
      vx = vec_mradds (vx, lCRV, (vector signed short)(0));                \
      vx0 = vec_mergeh (vx,vx);                                            \
      vx1 = vec_mergel (vx,vx);                                            \
                                                                           \
      /* uvx = ((CGU*u) + (CGV*v))>>15 */                                  \
      uvx = vec_mradds (U, lCGU, (vector signed short)(0));                \
      uvx = vec_mradds (V, lCGV, uvx);                                     \
      uvx0 = vec_mergeh (uvx,uvx);                                         \
      uvx1 = vec_mergel (uvx,uvx);                                         \
                                                                           \
      /* first (even) row of the pair */                                   \
      R0 = vec_add (Y0,vx0);                                               \
      G0 = vec_add (Y0,uvx0);                                              \
      B0 = vec_add (Y0,ux0);                                               \
      R1 = vec_add (Y1,vx1);                                               \
      G1 = vec_add (Y1,uvx1);                                              \
      B1 = vec_add (Y1,ux1);                                               \
                                                                           \
      R = vec_packclp (R0,R1);                                             \
      G = vec_packclp (G0,G1);                                             \
      B = vec_packclp (B0,B1);                                             \
                                                                           \
      out_pixels(R,G,B,oute);                                              \
                                                                           \
      /* second (odd) row reuses the same chroma terms */                  \
      R0 = vec_add (Y2,vx0);                                               \
      G0 = vec_add (Y2,uvx0);                                              \
      B0 = vec_add (Y2,ux0);                                               \
      R1 = vec_add (Y3,vx1);                                               \
      G1 = vec_add (Y3,uvx1);                                              \
      B1 = vec_add (Y3,ux1);                                               \
      R = vec_packclp (R0,R1);                                             \
      G = vec_packclp (G0,G1);                                             \
      B = vec_packclp (B0,B1);                                             \
                                                                           \
                                                                           \
      out_pixels(R,G,B,outo);                                              \
                                                                           \
      y1i += 16;                                                           \
      y2i += 16;                                                           \
      ui += 8;                                                             \
      vi += 8;                                                             \
                                                                           \
    }                                                                      \
                                                                           \
    outo += (outstrides[0])>>4;                                            \
    oute += (outstrides[0])>>4;                                            \
                                                                           \
    ui += instrides_scl[1];                                                \
    vi += instrides_scl[2];                                                \
    y1i += instrides_scl[0];                                               \
    y2i += instrides_scl[0];                                               \
  }                                                                        \
  return srcSliceH;                                                        \
}
403
404
405 #define out_abgr(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),c,b,a,ptr)
406 #define out_bgra(a,b,c,ptr) vec_mstrgb32(typeof(a),c,b,a,((typeof (a))(0)),ptr)
407 #define out_rgba(a,b,c,ptr) vec_mstrgb32(typeof(a),a,b,c,((typeof (a))(0)),ptr)
408 #define out_argb(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),a,b,c,ptr)
409 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
410 #define out_bgr24(a,b,c,ptr) vec_mstrgb24(c,b,a,ptr)
411
412 DEFCSP420_CVT (yuv2_abgr32, out_abgr)
413 DEFCSP420_CVT (yuv2_bgra32, out_argb)
414 DEFCSP420_CVT (yuv2_rgba32, out_rgba)
415 DEFCSP420_CVT (yuv2_argb32, out_argb)
416 DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
417 DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
418
419
420 // uyvy|uyvy|uyvy|uyvy
421 // 0123 4567 89ab cdef
422 static
423 const vector unsigned char
424 demux_u = (vector unsigned char)(0x10,0x00,0x10,0x00,
425 0x10,0x04,0x10,0x04,
426 0x10,0x08,0x10,0x08,
427 0x10,0x0c,0x10,0x0c),
428 demux_v = (vector unsigned char)(0x10,0x02,0x10,0x02,
429 0x10,0x06,0x10,0x06,
430 0x10,0x0A,0x10,0x0A,
431 0x10,0x0E,0x10,0x0E),
432 demux_y = (vector unsigned char)(0x10,0x01,0x10,0x03,
433 0x10,0x05,0x10,0x07,
434 0x10,0x09,0x10,0x0B,
435 0x10,0x0D,0x10,0x0F);
436
437 /*
438 this is so I can play live CCIR raw video
439 */
440 static int altivec_uyvy_rgb32 (SwsContext *c,
441 unsigned char **in, int *instrides,
442 int srcSliceY, int srcSliceH,
443 unsigned char **oplanes, int *outstrides)
444 {
445 int w = c->srcW;
446 int h = srcSliceH;
447 int i,j;
448 vector unsigned char uyvy;
449 vector signed short Y,U,V;
450 vector signed short vx,ux,uvx;
451 vector signed short R0,G0,B0,R1,G1,B1;
452 vector unsigned char R,G,B;
453 vector unsigned char *out;
454 ubyte *img;
455
456 img = in[0];
457 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
458
459 for (i=0;i<h;i++) {
460 for (j=0;j<w/16;j++) {
461 uyvy = vec_ld (0, img);
462 U = (vector signed short)
463 vec_perm (uyvy, (vector unsigned char)(0), demux_u);
464
465 V = (vector signed short)
466 vec_perm (uyvy, (vector unsigned char)(0), demux_v);
467
468 Y = (vector signed short)
469 vec_perm (uyvy, (vector unsigned char)(0), demux_y);
470
471 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
472
473 uyvy = vec_ld (16, img);
474 U = (vector signed short)
475 vec_perm (uyvy, (vector unsigned char)(0), demux_u);
476
477 V = (vector signed short)
478 vec_perm (uyvy, (vector unsigned char)(0), demux_v);
479
480 Y = (vector signed short)
481 vec_perm (uyvy, (vector unsigned char)(0), demux_y);
482
483 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
484
485 R = vec_packclp (R0,R1);
486 G = vec_packclp (G0,G1);
487 B = vec_packclp (B0,B1);
488
489 // vec_mstbgr24 (R,G,B, out);
490 out_rgba (R,G,B,out);
491
492 img += 32;
493 }
494 }
495 }
496
497
498
499 /* Ok currently the acceleration routine only supports
500 inputs of widths a multiple of 16
501 and heights a multiple 2
502
503 So we just fall back to the C codes for this.
504 */
505 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
506 {
507 if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
508 return NULL;
509
510 /*
511 and this seems not to matter too much I tried a bunch of
512 videos with abnormal widths and mplayer crashes else where.
513 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
514 boom with X11 bad match.
515
516 */
517 if ((c->srcW & 0xf) != 0) return NULL;
518
519 switch (c->srcFormat) {
520 case IMGFMT_YVU9:
521 case IMGFMT_IF09:
522 case IMGFMT_YV12:
523 case IMGFMT_I420:
524 case IMGFMT_IYUV:
525 case IMGFMT_CLPL:
526 case IMGFMT_Y800:
527 case IMGFMT_Y8:
528 case IMGFMT_NV12:
529 case IMGFMT_NV21:
530 if ((c->srcH & 0x1) != 0)
531 return NULL;
532
533 switch(c->dstFormat){
534 case IMGFMT_RGB24:
535 MSG_WARN("ALTIVEC: Color Space RGB24\n");
536 return altivec_yuv2_rgb24;
537 case IMGFMT_BGR24:
538 MSG_WARN("ALTIVEC: Color Space BGR24\n");
539 return altivec_yuv2_bgr24;
540 case IMGFMT_RGB32:
541 MSG_WARN("ALTIVEC: Color Space ARGB32\n");
542 return altivec_yuv2_argb32;
543 case IMGFMT_BGR32:
544 MSG_WARN("ALTIVEC: Color Space BGRA32\n");
545 // return profile_altivec_bgra32;
546
547 return altivec_yuv2_bgra32;
548 default: return NULL;
549 }
550 break;
551
552 case IMGFMT_UYVY:
553 switch(c->dstFormat){
554 case IMGFMT_RGB32:
555 MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
556 return altivec_uyvy_rgb32;
557 case IMGFMT_RGB24:
558 case IMGFMT_BGR32:
559
560 default: return NULL;
561 }
562 break;
563
564 }
565 return NULL;
566 }
567
568
569 int yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4])
570 {
571
572 vector signed short
573 CY = (vector signed short)(0x7fff),
574 CRV = (vector signed short)(22972),
575 CBU = (vector signed short)(29029),
576 CGU = (vector signed short)(-11276),
577 CGV = (vector signed short)(-23400),
578 OY;
579
580 vector unsigned short CSHIFT = (vector unsigned short)(1);
581
582 vector signed short Y0;
583 int brightness = c->brightness, contrast = c->contrast, saturation = c->saturation;
584 int64_t crv __attribute__ ((aligned(16)));
585 int64_t cbu __attribute__ ((aligned(16)));
586 int64_t cgu __attribute__ ((aligned(16)));
587 int64_t cgv __attribute__ ((aligned(16)));
588 short tmp __attribute__ ((aligned(16)));
589
590 int64_t cy = (1<<16)-1;
591 int64_t oy = 0;
592
593 if ((c->flags & SWS_CPU_CAPS_ALTIVEC) == 0)
594 return;
595
596 crv = inv_table[0];
597 cbu = inv_table[1];
598 cgu = inv_table[2];
599 cgv = inv_table[3];
600
601 #if 0
602 printf ("crv: %hvx\n", CRV);
603 printf ("cbu: %hvx\n", CBU);
604 printf ("cgv: %hvx\n", CGV);
605 printf ("cgu: %hvx\n", CGU);
606
607 printf ("contrast: %d, brightness: %d, saturation: %d\n", contrast, brightness, saturation);
608
609 printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
610 #endif
611
612 cy = (cy *contrast )>>17;
613 crv= (crv*contrast * saturation)>>32;
614 cbu= (cbu*contrast * saturation)>>32;
615 cgu= (cgu*contrast * saturation)>>32;
616 cgv= (cgv*contrast * saturation)>>32;
617
618 oy -= 256*brightness;
619
620
621 //printf("%llx %llx %llx %llx %llx\n", cy, crv, cbu, cgu, cgv);
622
623 // vector signed short CBU,CRV,CGU,CGY,CY;
624 tmp = cy;
625 CY = vec_lde (0, &tmp);
626 CY = vec_splat (CY, 0);
627
628 tmp = oy;
629 OY = vec_lde (0, &tmp);
630 OY = vec_splat (OY, 0);
631
632 tmp = crv>>3;
633 CRV = vec_lde (0, &tmp);
634 CRV = vec_splat (CRV, 0);
635 tmp = cbu>>3;
636 CBU = vec_lde (0, &tmp);
637 CBU = vec_splat (CBU, 0);
638
639 tmp = -(cgu>>1);
640 CGU = vec_lde (0, &tmp);
641 CGU = vec_splat (CGU, 0);
642 tmp = -(cgv>>1);
643 CGV = vec_lde (0, &tmp);
644 CGV = vec_splat (CGV, 0);
645
646 CSHIFT = (vector unsigned short)(2);
647 #if 1
648 c->CSHIFT = CSHIFT;
649 c->CY = CY;
650 c->OY = OY;
651 c->CRV = CRV;
652 c->CBU = CBU;
653 c->CGU = CGU;
654 c->CGV = CGV;
655 #endif
656 #if 1
657 printf ("cy: %hvx\n", CY);
658 printf ("oy: %hvx\n", OY);
659 printf ("crv: %hvx\n", CRV);
660 printf ("cbu: %hvx\n", CBU);
661 printf ("cgv: %hvx\n", CGV);
662 printf ("cgu: %hvx\n", CGU);
663 #endif
664 }
665
666
667 void
668 altivec_yuv2packedX (SwsContext *c,
669 int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
670 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
671 uint8_t *dest, int dstW, int dstY)
672 {
673 int i,j;
674 short tmp __attribute__((aligned (16)));
675 short *p;
676 short *f;
677 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
678 vector signed short R0,G0,B0,R1,G1,B1;
679
680 vector unsigned char R,G,B,pels[3];
681 vector unsigned char *out,*nout;
682 vector signed short RND = (vector signed short)(1<<3);
683 vector unsigned short SCL = (vector unsigned short)(4);
684 unsigned long scratch[16] __attribute__ ((aligned (16)));
685
686 vector signed short *vYCoeffsBank, *vCCoeffsBank;
687
688 vector signed short *YCoeffs, *CCoeffs;
689
690 vYCoeffsBank = malloc (sizeof (vector signed short)*lumFilterSize*dstW);
691 vCCoeffsBank = malloc (sizeof (vector signed short)*chrFilterSize*dstW);
692
693 for (i=0;i<lumFilterSize*dstW;i++) {
694 tmp = c->vLumFilter[i];
695 p = &vYCoeffsBank[i];
696 for (j=0;j<8;j++)
697 p[j] = tmp;
698 }
699
700 for (i=0;i<chrFilterSize*dstW;i++) {
701 tmp = c->vChrFilter[i];
702 p = &vCCoeffsBank[i];
703 for (j=0;j<8;j++)
704 p[j] = tmp;
705 }
706
707 YCoeffs = vYCoeffsBank+dstY*lumFilterSize;
708 CCoeffs = vCCoeffsBank+dstY*chrFilterSize;
709
710 out = (vector unsigned char *)dest;
711
712 for(i=0; i<dstW; i+=16){
713 Y0 = RND;
714 Y1 = RND;
715 /* extract 16 coeffs from lumSrc */
716 for(j=0; j<lumFilterSize; j++) {
717 X0 = vec_ld (0, &lumSrc[j][i]);
718 X1 = vec_ld (16, &lumSrc[j][i]);
719 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
720 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
721 }
722
723 U = RND;
724 V = RND;
725 /* extract 8 coeffs from U,V */
726 for(j=0; j<chrFilterSize; j++) {
727 X = vec_ld (0, &chrSrc[j][i/2]);
728 U = vec_mradds (X, CCoeffs[j], U);
729 X = vec_ld (0, &chrSrc[j][i/2+2048]);
730 V = vec_mradds (X, CCoeffs[j], V);
731 }
732
733 /* scale and clip signals */
734 Y0 = vec_sra (Y0, SCL);
735 Y1 = vec_sra (Y1, SCL);
736 U = vec_sra (U, SCL);
737 V = vec_sra (V, SCL);
738
739 Y0 = vec_clip (Y0);
740 Y1 = vec_clip (Y1);
741 U = vec_clip (U);
742 V = vec_clip (V);
743
744 /* now we have
745 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
746 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
747
748 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
749 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
750 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
751 */
752
753 U0 = vec_mergeh (U,U);
754 V0 = vec_mergeh (V,V);
755
756 U1 = vec_mergel (U,U);
757 V1 = vec_mergel (V,V);
758
759 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
760 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
761
762 R = vec_packclp (R0,R1);
763 G = vec_packclp (G0,G1);
764 B = vec_packclp (B0,B1);
765
766 out_rgba (R,G,B,out);
767 }
768
769 if (i < dstW) {
770 i -= 16;
771
772 Y0 = RND;
773 Y1 = RND;
774 /* extract 16 coeffs from lumSrc */
775 for(j=0; j<lumFilterSize; j++) {
776 X0 = vec_ld (0, &lumSrc[j][i]);
777 X1 = vec_ld (16, &lumSrc[j][i]);
778 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
779 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
780 }
781
782 U = RND;
783 V = RND;
784 /* extract 8 coeffs from U,V */
785 for(j=0; j<chrFilterSize; j++) {
786 X = vec_ld (0, &chrSrc[j][i/2]);
787 U = vec_mradds (X, CCoeffs[j], U);
788 X = vec_ld (0, &chrSrc[j][i/2+2048]);
789 V = vec_mradds (X, CCoeffs[j], V);
790 }
791
792 /* scale and clip signals */
793 Y0 = vec_sra (Y0, SCL);
794 Y1 = vec_sra (Y1, SCL);
795 U = vec_sra (U, SCL);
796 V = vec_sra (V, SCL);
797
798 Y0 = vec_clip (Y0);
799 Y1 = vec_clip (Y1);
800 U = vec_clip (U);
801 V = vec_clip (V);
802
803 /* now we have
804 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
805 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
806
807 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
808 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
809 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
810 */
811
812 U0 = vec_mergeh (U,U);
813 V0 = vec_mergeh (V,V);
814
815 U1 = vec_mergel (U,U);
816 V1 = vec_mergel (V,V);
817
818 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
819 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
820
821 R = vec_packclp (R0,R1);
822 G = vec_packclp (G0,G1);
823 B = vec_packclp (B0,B1);
824
825 nout = (vector unsigned char *)scratch;
826 out_rgba (R,G,B,nout);
827
828 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
829 }
830
831 if (vYCoeffsBank) free (vYCoeffsBank);
832 if (vCCoeffsBank) free (vCCoeffsBank);
833
834 }
835
836