comparison libswscale/ppc/yuv2rgb_altivec.c @ 29028:f8db50af4035

Move yuv2rgb code to subdirs.
author ramiro
date Thu, 26 Mar 2009 01:30:10 +0000
parents
children 882a1f5613e1
comparison
equal deleted inserted replaced
29027:9c1508cdb122 29028:f8db50af4035
1 /*
2 * AltiVec acceleration for colorspace conversion
3 *
4 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 /*
24 Convert I420 YV12 to RGB in various formats,
25 it rejects images that are not in 420 formats,
26 it rejects images that don't have widths of multiples of 16,
27 it rejects images that don't have heights of multiples of 2.
28 Reject defers to C simulation code.
29
30 Lots of optimizations to be done here.
31
32 1. Need to fix saturation code. I just couldn't get it to fly with packs
33 and adds, so we currently use max/min to clip.
34
35 2. The inefficient use of chroma loading needs a bit of brushing up.
36
37 3. Analysis of pipeline stalls needs to be done. Use shark to identify
38 pipeline stalls.
39
40
41 MODIFIED to calculate coeffs from currently selected color space.
42 MODIFIED core to be a macro where you specify the output format.
43 ADDED UYVY conversion which is never called due to something in swscale.
44 CORRECTED algorithm selection to be strict on input formats.
45 ADDED runtime detection of AltiVec.
46
47 ADDED altivec_yuv2packedX vertical scl + RGB converter
48
49 March 27,2004
50 PERFORMANCE ANALYSIS
51
52 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
53 used as test.
54 The AltiVec version uses 10% of the processor or ~100Mips for D1 video
55 same sequence.
56
57 720 * 480 * 30 ~10MPS
58
59 so we have roughly 10 clocks per pixel. This is too high, something has
60 to be wrong.
61
62 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
63 need for vec_min.
64
65 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
66 the input video frame, it was just decompressed so it probably resides in L1
67 caches. However, we are creating the output video stream. This needs to use the
68 DSTST instruction to optimize for the cache. We couple this with the fact that
69 we are not going to be visiting the input buffer again so we mark it Least
70 Recently Used. This shaves 25% of the processor cycles off.
71
72 Now memcpy is the largest mips consumer in the system, probably due
73 to the inefficient X11 stuff.
74
75 GL libraries seem to be very slow on this machine 1.33Ghz PB running
76 Jaguar, this is not the case for my 1Ghz PB. I thought it might be
77 a versioning issue, however I have libGL.1.2.dylib for both
78 machines. (We need to figure this out now.)
79
80 GL2 libraries work now with patch for RGB32.
81
82 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
83
84 Integrated luma prescaling adjustment for saturation/contrast/brightness
85 adjustment.
86 */
87
88 #include <stdio.h>
89 #include <stdlib.h>
90 #include <string.h>
91 #include <inttypes.h>
92 #include <assert.h>
93 #include "config.h"
94 #include "libswscale/rgb2rgb.h"
95 #include "libswscale/swscale.h"
96 #include "libswscale/swscale_internal.h"
97
98 #undef PROFILE_THE_BEAST
99 #undef INC_SCALING
100
101 typedef unsigned char ubyte;
102 typedef signed char sbyte;
103
104
105 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
106 homogeneous vector registers x0,x1,x2 are interleaved with the
107 following technique:
108
109 o0 = vec_mergeh (x0,x1);
110 o1 = vec_perm (o0, x2, perm_rgb_0);
111 o2 = vec_perm (o0, x2, perm_rgb_1);
112 o3 = vec_mergel (x0,x1);
113 o4 = vec_perm (o3,o2,perm_rgb_2);
114 o5 = vec_perm (o3,o2,perm_rgb_3);
115
116 perm_rgb_0: o0(RG).h v1(B) --> o1*
117 0 1 2 3 4
118 rgbr|gbrg|brgb|rgbr
119 0010 0100 1001 0010
120 0102 3145 2673 894A
121
122 perm_rgb_1: o0(RG).h v1(B) --> o2
123 0 1 2 3 4
124 gbrg|brgb|bbbb|bbbb
125 0100 1001 1111 1111
126 B5CD 6EF7 89AB CDEF
127
128 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
129 0 1 2 3 4
130 gbrg|brgb|rgbr|gbrg
131 1111 1111 0010 0100
132 89AB CDEF 0182 3945
133
134 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
135 0 1 2 3 4
136 brgb|rgbr|gbrg|brgb
137 1001 0010 0100 1001
138 a67b 89cA BdCD eEFf
139
140 */
141 static
142 const vector unsigned char
143 perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
144 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
145 perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
146 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
147 perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
148 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
149 perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
150 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
151
/* Interleave three 16-byte channel vectors into three vectors of packed
   3-byte pixels.  The per-pixel byte order is x0, x1, x2 (note the reversed
   parameter list); the results are written to y0, y1, y2 in memory order. */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                       \
do {                                                        \
    __typeof__(x0) m3_hi, m3_rest, m3_lo;                   \
    m3_hi   = vec_mergeh (x0,x1);                           \
    y0      = vec_perm (m3_hi, x2, perm_rgb_0);             \
    m3_rest = vec_perm (m3_hi, x2, perm_rgb_1);             \
    m3_lo   = vec_mergel (x0,x1);                           \
    y1      = vec_perm (m3_lo, m3_rest, perm_rgb_2);        \
    y2      = vec_perm (m3_lo, m3_rest, perm_rgb_3);        \
} while(0)
162
/* Store 16 pixels as packed 24-bit data with per-pixel byte order x2, x1, x0
   (so vec_mstbgr24(R,G,B,p) writes B,G,R).  Three vectors are written through
   ptr, which is advanced by three elements; ptr must be a modifiable
   pointer-to-vector lvalue.
   The expansion deliberately has no trailing semicolon so the macro behaves
   as a single statement (safe in unbraced if/else); callers supply the ';'. */
#define vec_mstbgr24(x0,x1,x2,ptr)      \
do {                                    \
    __typeof__(x0) _0,_1,_2;            \
    vec_merge3 (x0,x1,x2,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
} while (0)
171
/* Store 16 pixels as packed 24-bit data with per-pixel byte order x0, x1, x2
   (so vec_mstrgb24(R,G,B,p) writes R,G,B).  Three vectors are written through
   ptr, which is advanced by three elements; ptr must be a modifiable
   pointer-to-vector lvalue.
   The expansion deliberately has no trailing semicolon so the macro behaves
   as a single statement (safe in unbraced if/else); callers supply the ';'. */
#define vec_mstrgb24(x0,x1,x2,ptr)      \
do {                                    \
    __typeof__(x0) _0,_1,_2;            \
    vec_merge3 (x2,x1,x0,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
} while (0)
180
/* Store 16 pixels as packed 32-bit data.  x0..x3 are byte vectors of type T
   holding one channel each; the per-pixel byte order in memory is
   x0, x1, x2, x3.  Four vectors (64 bytes) are written starting at ptr, and
   ptr is then advanced by four elements, so ptr must be a modifiable
   pointer-to-vector lvalue.
   The expansion deliberately has no trailing semicolon so the macro behaves
   as a single statement (safe in unbraced if/else); callers supply the ';'. */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
do {                                                                          \
    T _0,_1,_2,_3;                                                            \
    /* first 8 pixels: pair the channels byte-wise, then 16-bit-wise */       \
    _0 = vec_mergeh (x0,x1);                                                  \
    _1 = vec_mergeh (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 0*16, (T *)ptr);                                              \
    vec_st (_3, 1*16, (T *)ptr);                                              \
    /* last 8 pixels */                                                       \
    _0 = vec_mergel (x0,x1);                                                  \
    _1 = vec_mergel (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 2*16, (T *)ptr);                                              \
    vec_st (_3, 3*16, (T *)ptr);                                              \
    ptr += 4;                                                                 \
} while (0)
202
203 /*
204
205 | 1 0 1.4021 | | Y |
206 | 1 -0.3441 -0.7142 |x| Cb|
207 | 1 1.7718 0 | | Cr|
208
209
210 Y: [-128 127]
211 Cb/Cr : [-128 127]
212
213 Typical YUV conversions work on Y in the range 0-255; this version has been optimized for JPEG decode.
214
215 */
216
217
218
219
/* Widen bytes 0..7 of x to eight 16-bit lanes: each result lane is a zero
   byte (selector 0x10, taken from the all-zero second operand) followed by
   one source byte.  The whole expansion is parenthesized so that operators
   applied to the macro result act on the cast value, not on vec_perm. */
#define vec_unh(x) \
    ((vector signed short) \
        vec_perm(x,(__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                         0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07})))
/* Widen bytes 8..15 of x to eight 16-bit lanes: each result lane is a zero
   byte (selector 0x10, taken from the all-zero second operand) followed by
   one source byte.  The whole expansion is parenthesized so that operators
   applied to the macro result act on the cast value, not on vec_perm. */
#define vec_unl(x) \
    ((vector signed short) \
        vec_perm(x,(__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                         0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F})))
230
/* Clamp every signed 16-bit lane of x to the range [16, 235]. */
#define vec_clip_s16(x) \
    vec_min (vec_max (x, ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16})), \
             ((vector signed short){235,235,235,235,235,235,235,235}))
234
/* Pack two vectors of signed 16-bit lanes into one vector of 16 unsigned
   bytes.  Negative lanes are first raised to 0 with vec_max; the lanes are
   then reinterpreted as unsigned and narrowed with vec_packs.
   The whole expansion is parenthesized so that operators applied to the
   macro result act on the cast value, not on vec_packs. */
#define vec_packclp(x,y) \
    ((vector unsigned char)vec_packs \
        ((vector unsigned short)vec_max (x,((vector signed short) {0})), \
         (vector unsigned short)vec_max (y,((vector signed short) {0}))))
239
240 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)
241
242
243 static inline void cvtyuvtoRGB (SwsContext *c,
244 vector signed short Y, vector signed short U, vector signed short V,
245 vector signed short *R, vector signed short *G, vector signed short *B)
246 {
247 vector signed short vx,ux,uvx;
248
249 Y = vec_mradds (Y, c->CY, c->OY);
250 U = vec_sub (U,(vector signed short)
251 vec_splat((vector signed short){128},0));
252 V = vec_sub (V,(vector signed short)
253 vec_splat((vector signed short){128},0));
254
255 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
256 ux = vec_sl (U, c->CSHIFT);
257 *B = vec_mradds (ux, c->CBU, Y);
258
259 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
260 vx = vec_sl (V, c->CSHIFT);
261 *R = vec_mradds (vx, c->CRV, Y);
262
263 // uvx = ((CGU*u) + (CGV*v))>>15;
264 uvx = vec_mradds (U, c->CGU, Y);
265 *G = vec_mradds (V, c->CGV, uvx);
266 }
267
268
269 /*
270 ------------------------------------------------------------------------------
271 CS converters
272 ------------------------------------------------------------------------------
273 */
274
275
/*
 * DEFCSP420_CVT(name, out_pixels) defines a static converter
 * altivec_<name>() with the SwsFunc-style signature used below.
 *
 * The generated function reads three planes (in[0] luma, in[1] and in[2]
 * chroma, each chroma sample shared by a 2x2 luma block) and converts two
 * luma rows per outer iteration, 16 pixels per inner iteration.  Each
 * 16-pixel group is written through out_pixels(R,G,B,ptr), which must store
 * the pixels and advance ptr.  Inputs are loaded with the vec_lvsl/vec_perm
 * idiom, i.e. two consecutive vectors are read for every unaligned load.
 * The function returns srcSliceH.
 */
#define DEFCSP420_CVT(name,out_pixels)                                         \
static int altivec_##name (SwsContext *c,                                      \
                           unsigned char **in, int *instrides,                 \
                           int srcSliceY, int srcSliceH,                       \
                           unsigned char **oplanes, int *outstrides)           \
{                                                                              \
    const int width     = c->srcW;                                             \
    const int row_pairs = srcSliceH/2;                                         \
    const int blocks    = width/16;                                            \
                                                                               \
    /* pointer adjustments applied after each pair of rows: the inner loop */  \
    /* moves the luma pointers by width and the chroma pointers by width/2 */  \
    const int luma_skip = instrides[0]*2-width;                                \
    const int cb_skip   = instrides[1]-width/2;                                \
    const int cr_skip   = instrides[2]-width/2;                                \
    const int dst_skip  = (outstrides[0])>>4;                                  \
                                                                               \
    /* control word handed to vec_dstst for the two output rows */             \
    const int dst_hint  = (0x02000002|(((width*3+32)/32)<<16));                \
                                                                               \
    const vector signed short   zero_s16 = (vector signed short){0};           \
    const vector signed char    bias_s8  =                                     \
        vec_splat((vector signed char){128},0);                                \
    const vector signed short   coef_y   = c->CY;                              \
    const vector signed short   offs_y   = c->OY;                              \
    const vector signed short   coef_rv  = c->CRV;                             \
    const vector signed short   coef_bu  = c->CBU;                             \
    const vector signed short   coef_gu  = c->CGU;                             \
    const vector signed short   coef_gv  = c->CGV;                             \
    const vector unsigned short shift_c  = c->CSHIFT;                          \
                                                                               \
    ubyte *luma_top = in[0];                                                   \
    ubyte *luma_bot = in[0]+instrides[0];                                      \
    ubyte *cb_src   = in[1];                                                   \
    ubyte *cr_src   = in[2];                                                   \
                                                                               \
    vector unsigned char *dst_top                                              \
        = (vector unsigned char *)                                             \
          (oplanes[0]+srcSliceY*outstrides[0]);                                \
    vector unsigned char *dst_bot                                              \
        = (vector unsigned char *)                                             \
          (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);                  \
                                                                               \
    int row, blk;                                                              \
                                                                               \
    for (row = 0; row < row_pairs; row++) {                                    \
        vec_dstst (dst_bot, dst_hint, 0);                                      \
        vec_dstst (dst_top, dst_hint, 1);                                      \
                                                                               \
        for (blk = 0; blk < blocks; blk++) {                                   \
            vector unsigned char luma0, luma1;                                 \
            vector signed char   cb8, cr8;                                     \
            vector signed short  Cb, Cr;                                       \
            vector signed short  Ya, Yb, Yc, Yd;                               \
            vector signed short  dB, dR, dG;                                   \
            vector signed short  dB_h, dB_l, dR_h, dR_l, dG_h, dG_l;           \
            vector unsigned char R, G, B;                                      \
                                                                               \
            /* unaligned loads of 16 luma bytes per row and of the chroma */   \
            luma0 = (vector unsigned char)                                     \
                vec_perm (((vector unsigned char *)luma_top)[0],               \
                          ((vector unsigned char *)luma_top)[1],               \
                          vec_lvsl (0, luma_top));                             \
            luma1 = (vector unsigned char)                                     \
                vec_perm (((vector unsigned char *)luma_bot)[0],               \
                          ((vector unsigned char *)luma_bot)[1],               \
                          vec_lvsl (0, luma_bot));                             \
            cb8 = (vector signed char)                                         \
                vec_perm (((vector unsigned char *)cb_src)[0],                 \
                          ((vector unsigned char *)cb_src)[1],                 \
                          vec_lvsl (0, cb_src));                               \
            cr8 = (vector signed char)                                         \
                vec_perm (((vector unsigned char *)cr_src)[0],                 \
                          ((vector unsigned char *)cr_src)[1],                 \
                          vec_lvsl (0, cr_src));                               \
                                                                               \
            /* remove the chroma bias, then widen the first 8 samples */       \
            cb8 = (vector signed char) vec_sub (cb8, bias_s8);                 \
            cr8 = (vector signed char) vec_sub (cr8, bias_s8);                 \
            Cb  = vec_unpackh (cb8);                                           \
            Cr  = vec_unpackh (cr8);                                           \
                                                                               \
            /* widen and prescale the luma of both rows */                     \
            Ya = vec_mradds (vec_unh (luma0), coef_y, offs_y);                 \
            Yb = vec_mradds (vec_unl (luma0), coef_y, offs_y);                 \
            Yc = vec_mradds (vec_unh (luma1), coef_y, offs_y);                 \
            Yd = vec_mradds (vec_unl (luma1), coef_y, offs_y);                 \
                                                                               \
            /* blue delta: (CBU*(u<<CSHIFT)+0x4000)>>15, each value doubled */ \
            dB   = vec_sl (Cb, shift_c);                                       \
            dB   = vec_mradds (dB, coef_bu, zero_s16);                         \
            dB_h = vec_mergeh (dB,dB);                                         \
            dB_l = vec_mergel (dB,dB);                                         \
                                                                               \
            /* red delta: (CRV*(v<<CSHIFT)+0x4000)>>15, each value doubled */  \
            dR   = vec_sl (Cr, shift_c);                                       \
            dR   = vec_mradds (dR, coef_rv, zero_s16);                         \
            dR_h = vec_mergeh (dR,dR);                                         \
            dR_l = vec_mergel (dR,dR);                                         \
                                                                               \
            /* green delta: ((CGU*u) + (CGV*v))>>15, each value doubled */     \
            dG   = vec_mradds (Cb, coef_gu, zero_s16);                         \
            dG   = vec_mradds (Cr, coef_gv, dG);                               \
            dG_h = vec_mergeh (dG,dG);                                         \
            dG_l = vec_mergel (dG,dG);                                         \
                                                                               \
            /* upper row */                                                    \
            R = vec_packclp (vec_add (Ya,dR_h), vec_add (Yb,dR_l));            \
            G = vec_packclp (vec_add (Ya,dG_h), vec_add (Yb,dG_l));            \
            B = vec_packclp (vec_add (Ya,dB_h), vec_add (Yb,dB_l));            \
            out_pixels(R,G,B,dst_top);                                         \
                                                                               \
            /* lower row, sharing the same chroma deltas */                    \
            R = vec_packclp (vec_add (Yc,dR_h), vec_add (Yd,dR_l));            \
            G = vec_packclp (vec_add (Yc,dG_h), vec_add (Yd,dG_l));            \
            B = vec_packclp (vec_add (Yc,dB_h), vec_add (Yd,dB_l));            \
            out_pixels(R,G,B,dst_bot);                                         \
                                                                               \
            luma_top += 16;                                                    \
            luma_bot += 16;                                                    \
            cb_src   += 8;                                                     \
            cr_src   += 8;                                                     \
        }                                                                      \
                                                                               \
        dst_bot  += dst_skip;                                                  \
        dst_top  += dst_skip;                                                  \
                                                                               \
        cb_src   += cb_skip;                                                   \
        cr_src   += cr_skip;                                                   \
        luma_top += luma_skip;                                                 \
        luma_bot += luma_skip;                                                 \
    }                                                                          \
    return srcSliceH;                                                          \
}
439
440
/* Output adapters: out_<fmt>(R,G,B,ptr) stores 16 pixels in the named byte
   order and advances ptr.
   For the 32-bit layouts the alpha channel is an all-255 vector.  A bare
   (T){255} compound literal would set only the first lane and zero the other
   fifteen, so the constant is splatted across all lanes with vec_splat. */
#define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a))vec_splat((__typeof__ (a)){255},0)),c,b,a,ptr)
#define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a))vec_splat((__typeof__ (a)){255},0)),ptr)
#define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a))vec_splat((__typeof__ (a)){255},0)),ptr)
#define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a))vec_splat((__typeof__ (a)){255},0)),a,b,c,ptr)
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
447
448 DEFCSP420_CVT (yuv2_abgr, out_abgr)
449 #if 1
450 DEFCSP420_CVT (yuv2_bgra, out_bgra)
451 #else
452 static int altivec_yuv2_bgra32 (SwsContext *c,
453 unsigned char **in, int *instrides,
454 int srcSliceY, int srcSliceH,
455 unsigned char **oplanes, int *outstrides)
456 {
457 int w = c->srcW;
458 int h = srcSliceH;
459 int i,j;
460 int instrides_scl[3];
461 vector unsigned char y0,y1;
462
463 vector signed char u,v;
464
465 vector signed short Y0,Y1,Y2,Y3;
466 vector signed short U,V;
467 vector signed short vx,ux,uvx;
468 vector signed short vx0,ux0,uvx0;
469 vector signed short vx1,ux1,uvx1;
470 vector signed short R0,G0,B0;
471 vector signed short R1,G1,B1;
472 vector unsigned char R,G,B;
473
474 vector unsigned char *uivP, *vivP;
475 vector unsigned char align_perm;
476
477 vector signed short
478 lCY = c->CY,
479 lOY = c->OY,
480 lCRV = c->CRV,
481 lCBU = c->CBU,
482 lCGU = c->CGU,
483 lCGV = c->CGV;
484
485 vector unsigned short lCSHIFT = c->CSHIFT;
486
487 ubyte *y1i = in[0];
488 ubyte *y2i = in[0]+w;
489 ubyte *ui = in[1];
490 ubyte *vi = in[2];
491
492 vector unsigned char *oute
493 = (vector unsigned char *)
494 (oplanes[0]+srcSliceY*outstrides[0]);
495 vector unsigned char *outo
496 = (vector unsigned char *)
497 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
498
499
500 instrides_scl[0] = instrides[0];
501 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */
502 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */
503
504
505 for (i=0;i<h/2;i++) {
506 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
507 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
508
509 for (j=0;j<w/16;j++) {
510
511 y0 = vec_ldl (0,y1i);
512 y1 = vec_ldl (0,y2i);
513 uivP = (vector unsigned char *)ui;
514 vivP = (vector unsigned char *)vi;
515
516 align_perm = vec_lvsl (0, ui);
517 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
518
519 align_perm = vec_lvsl (0, vi);
520 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
521 u = (vector signed char)
522 vec_sub (u,(vector signed char)
523 vec_splat((vector signed char){128},0));
524
525 v = (vector signed char)
526 vec_sub (v, (vector signed char)
527 vec_splat((vector signed char){128},0));
528
529 U = vec_unpackh (u);
530 V = vec_unpackh (v);
531
532
533 Y0 = vec_unh (y0);
534 Y1 = vec_unl (y0);
535 Y2 = vec_unh (y1);
536 Y3 = vec_unl (y1);
537
538 Y0 = vec_mradds (Y0, lCY, lOY);
539 Y1 = vec_mradds (Y1, lCY, lOY);
540 Y2 = vec_mradds (Y2, lCY, lOY);
541 Y3 = vec_mradds (Y3, lCY, lOY);
542
543 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */
544 ux = vec_sl (U, lCSHIFT);
545 ux = vec_mradds (ux, lCBU, (vector signed short){0});
546 ux0 = vec_mergeh (ux,ux);
547 ux1 = vec_mergel (ux,ux);
548
549 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */
550 vx = vec_sl (V, lCSHIFT);
551 vx = vec_mradds (vx, lCRV, (vector signed short){0});
552 vx0 = vec_mergeh (vx,vx);
553 vx1 = vec_mergel (vx,vx);
554 /* uvx = ((CGU*u) + (CGV*v))>>15 */
555 uvx = vec_mradds (U, lCGU, (vector signed short){0});
556 uvx = vec_mradds (V, lCGV, uvx);
557 uvx0 = vec_mergeh (uvx,uvx);
558 uvx1 = vec_mergel (uvx,uvx);
559 R0 = vec_add (Y0,vx0);
560 G0 = vec_add (Y0,uvx0);
561 B0 = vec_add (Y0,ux0);
562 R1 = vec_add (Y1,vx1);
563 G1 = vec_add (Y1,uvx1);
564 B1 = vec_add (Y1,ux1);
565 R = vec_packclp (R0,R1);
566 G = vec_packclp (G0,G1);
567 B = vec_packclp (B0,B1);
568
569 out_argb(R,G,B,oute);
570 R0 = vec_add (Y2,vx0);
571 G0 = vec_add (Y2,uvx0);
572 B0 = vec_add (Y2,ux0);
573 R1 = vec_add (Y3,vx1);
574 G1 = vec_add (Y3,uvx1);
575 B1 = vec_add (Y3,ux1);
576 R = vec_packclp (R0,R1);
577 G = vec_packclp (G0,G1);
578 B = vec_packclp (B0,B1);
579
580 out_argb(R,G,B,outo);
581 y1i += 16;
582 y2i += 16;
583 ui += 8;
584 vi += 8;
585
586 }
587
588 outo += (outstrides[0])>>4;
589 oute += (outstrides[0])>>4;
590
591 ui += instrides_scl[1];
592 vi += instrides_scl[2];
593 y1i += instrides_scl[0];
594 y2i += instrides_scl[0];
595 }
596 return srcSliceH;
597 }
598
599 #endif
600
601
602 DEFCSP420_CVT (yuv2_rgba, out_rgba)
603 DEFCSP420_CVT (yuv2_argb, out_argb)
604 DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
605 DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
606
607
608 // uyvy|uyvy|uyvy|uyvy
609 // 0123 4567 89ab cdef
610 static
611 const vector unsigned char
612 demux_u = {0x10,0x00,0x10,0x00,
613 0x10,0x04,0x10,0x04,
614 0x10,0x08,0x10,0x08,
615 0x10,0x0c,0x10,0x0c},
616 demux_v = {0x10,0x02,0x10,0x02,
617 0x10,0x06,0x10,0x06,
618 0x10,0x0A,0x10,0x0A,
619 0x10,0x0E,0x10,0x0E},
620 demux_y = {0x10,0x01,0x10,0x03,
621 0x10,0x05,0x10,0x07,
622 0x10,0x09,0x10,0x0B,
623 0x10,0x0D,0x10,0x0F};
624
625 /*
626 this is so I can play live CCIR raw video
627 */
628 static int altivec_uyvy_rgb32 (SwsContext *c,
629 unsigned char **in, int *instrides,
630 int srcSliceY, int srcSliceH,
631 unsigned char **oplanes, int *outstrides)
632 {
633 int w = c->srcW;
634 int h = srcSliceH;
635 int i,j;
636 vector unsigned char uyvy;
637 vector signed short Y,U,V;
638 vector signed short R0,G0,B0,R1,G1,B1;
639 vector unsigned char R,G,B;
640 vector unsigned char *out;
641 ubyte *img;
642
643 img = in[0];
644 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
645
646 for (i=0;i<h;i++) {
647 for (j=0;j<w/16;j++) {
648 uyvy = vec_ld (0, img);
649 U = (vector signed short)
650 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
651
652 V = (vector signed short)
653 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
654
655 Y = (vector signed short)
656 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
657
658 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
659
660 uyvy = vec_ld (16, img);
661 U = (vector signed short)
662 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
663
664 V = (vector signed short)
665 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
666
667 Y = (vector signed short)
668 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
669
670 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
671
672 R = vec_packclp (R0,R1);
673 G = vec_packclp (G0,G1);
674 B = vec_packclp (B0,B1);
675
676 // vec_mstbgr24 (R,G,B, out);
677 out_rgba (R,G,B,out);
678
679 img += 32;
680 }
681 }
682 return srcSliceH;
683 }
684
685
686
687 /* Ok currently the acceleration routine only supports
688 inputs of widths a multiple of 16
689 and heights a multiple of 2
690
691 So we just fall back to the C codes for this.
692 */
693 SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
694 {
695 if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
696 return NULL;
697
698 /*
699 and this seems not to matter too much I tried a bunch of
700 videos with abnormal widths and MPlayer crashes elsewhere.
701 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
702 boom with X11 bad match.
703
704 */
705 if ((c->srcW & 0xf) != 0) return NULL;
706
707 switch (c->srcFormat) {
708 case PIX_FMT_YUV410P:
709 case PIX_FMT_YUV420P:
710 /*case IMGFMT_CLPL: ??? */
711 case PIX_FMT_GRAY8:
712 case PIX_FMT_NV12:
713 case PIX_FMT_NV21:
714 if ((c->srcH & 0x1) != 0)
715 return NULL;
716
717 switch(c->dstFormat){
718 case PIX_FMT_RGB24:
719 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
720 return altivec_yuv2_rgb24;
721 case PIX_FMT_BGR24:
722 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
723 return altivec_yuv2_bgr24;
724 case PIX_FMT_ARGB:
725 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
726 return altivec_yuv2_argb;
727 case PIX_FMT_ABGR:
728 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
729 return altivec_yuv2_abgr;
730 case PIX_FMT_RGBA:
731 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
732 return altivec_yuv2_rgba;
733 case PIX_FMT_BGRA:
734 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
735 return altivec_yuv2_bgra;
736 default: return NULL;
737 }
738 break;
739
740 case PIX_FMT_UYVY422:
741 switch(c->dstFormat){
742 case PIX_FMT_BGR32:
743 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
744 return altivec_uyvy_rgb32;
745 default: return NULL;
746 }
747 break;
748
749 }
750 return NULL;
751 }
752
753 void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
754 {
755 union {
756 signed short tmp[8] __attribute__ ((aligned(16)));
757 vector signed short vec;
758 } buf;
759
760 buf.tmp[0] = ((0xffffLL) * contrast>>8)>>9; //cy
761 buf.tmp[1] = -256*brightness; //oy
762 buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv
763 buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu
764 buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu
765 buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv
766
767
768 c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
769 c->CY = vec_splat ((vector signed short)buf.vec, 0);
770 c->OY = vec_splat ((vector signed short)buf.vec, 1);
771 c->CRV = vec_splat ((vector signed short)buf.vec, 2);
772 c->CBU = vec_splat ((vector signed short)buf.vec, 3);
773 c->CGU = vec_splat ((vector signed short)buf.vec, 4);
774 c->CGV = vec_splat ((vector signed short)buf.vec, 5);
775 #if 0
776 {
777 int i;
778 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
779 for (i=0; i<6; i++)
780 printf("%s %d ", v[i],buf.tmp[i] );
781 printf("\n");
782 }
783 #endif
784 return;
785 }
786
787
788 void
789 ff_yuv2packedX_altivec(SwsContext *c,
790 int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
791 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
792 uint8_t *dest, int dstW, int dstY)
793 {
794 int i,j;
795 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
796 vector signed short R0,G0,B0,R1,G1,B1;
797
798 vector unsigned char R,G,B;
799 vector unsigned char *out,*nout;
800
801 vector signed short RND = vec_splat_s16(1<<3);
802 vector unsigned short SCL = vec_splat_u16(4);
803 unsigned long scratch[16] __attribute__ ((aligned (16)));
804
805 vector signed short *YCoeffs, *CCoeffs;
806
807 YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
808 CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
809
810 out = (vector unsigned char *)dest;
811
812 for (i=0; i<dstW; i+=16){
813 Y0 = RND;
814 Y1 = RND;
815 /* extract 16 coeffs from lumSrc */
816 for (j=0; j<lumFilterSize; j++) {
817 X0 = vec_ld (0, &lumSrc[j][i]);
818 X1 = vec_ld (16, &lumSrc[j][i]);
819 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
820 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
821 }
822
823 U = RND;
824 V = RND;
825 /* extract 8 coeffs from U,V */
826 for (j=0; j<chrFilterSize; j++) {
827 X = vec_ld (0, &chrSrc[j][i/2]);
828 U = vec_mradds (X, CCoeffs[j], U);
829 X = vec_ld (0, &chrSrc[j][i/2+2048]);
830 V = vec_mradds (X, CCoeffs[j], V);
831 }
832
833 /* scale and clip signals */
834 Y0 = vec_sra (Y0, SCL);
835 Y1 = vec_sra (Y1, SCL);
836 U = vec_sra (U, SCL);
837 V = vec_sra (V, SCL);
838
839 Y0 = vec_clip_s16 (Y0);
840 Y1 = vec_clip_s16 (Y1);
841 U = vec_clip_s16 (U);
842 V = vec_clip_s16 (V);
843
844 /* now we have
845 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
846 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
847
848 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
849 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
850 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
851 */
852
853 U0 = vec_mergeh (U,U);
854 V0 = vec_mergeh (V,V);
855
856 U1 = vec_mergel (U,U);
857 V1 = vec_mergel (V,V);
858
859 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
860 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
861
862 R = vec_packclp (R0,R1);
863 G = vec_packclp (G0,G1);
864 B = vec_packclp (B0,B1);
865
866 switch(c->dstFormat) {
867 case PIX_FMT_ABGR: out_abgr (R,G,B,out); break;
868 case PIX_FMT_BGRA: out_bgra (R,G,B,out); break;
869 case PIX_FMT_RGBA: out_rgba (R,G,B,out); break;
870 case PIX_FMT_ARGB: out_argb (R,G,B,out); break;
871 case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
872 case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
873 default:
874 {
875 /* If this is reached, the caller should have called yuv2packedXinC
876 instead. */
877 static int printed_error_message;
878 if (!printed_error_message) {
879 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
880 sws_format_name(c->dstFormat));
881 printed_error_message=1;
882 }
883 return;
884 }
885 }
886 }
887
888 if (i < dstW) {
889 i -= 16;
890
891 Y0 = RND;
892 Y1 = RND;
893 /* extract 16 coeffs from lumSrc */
894 for (j=0; j<lumFilterSize; j++) {
895 X0 = vec_ld (0, &lumSrc[j][i]);
896 X1 = vec_ld (16, &lumSrc[j][i]);
897 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
898 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
899 }
900
901 U = RND;
902 V = RND;
903 /* extract 8 coeffs from U,V */
904 for (j=0; j<chrFilterSize; j++) {
905 X = vec_ld (0, &chrSrc[j][i/2]);
906 U = vec_mradds (X, CCoeffs[j], U);
907 X = vec_ld (0, &chrSrc[j][i/2+2048]);
908 V = vec_mradds (X, CCoeffs[j], V);
909 }
910
911 /* scale and clip signals */
912 Y0 = vec_sra (Y0, SCL);
913 Y1 = vec_sra (Y1, SCL);
914 U = vec_sra (U, SCL);
915 V = vec_sra (V, SCL);
916
917 Y0 = vec_clip_s16 (Y0);
918 Y1 = vec_clip_s16 (Y1);
919 U = vec_clip_s16 (U);
920 V = vec_clip_s16 (V);
921
922 /* now we have
923 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
924 U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
925
926 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
927 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
928 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
929 */
930
931 U0 = vec_mergeh (U,U);
932 V0 = vec_mergeh (V,V);
933
934 U1 = vec_mergel (U,U);
935 V1 = vec_mergel (V,V);
936
937 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
938 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
939
940 R = vec_packclp (R0,R1);
941 G = vec_packclp (G0,G1);
942 B = vec_packclp (B0,B1);
943
944 nout = (vector unsigned char *)scratch;
945 switch(c->dstFormat) {
946 case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break;
947 case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break;
948 case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break;
949 case PIX_FMT_ARGB: out_argb (R,G,B,nout); break;
950 case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
951 case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
952 default:
953 /* Unreachable, I think. */
954 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
955 sws_format_name(c->dstFormat));
956 return;
957 }
958
959 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
960 }
961
962 }