Mercurial > mplayer.hg
annotate libswscale/ppc/yuv2rgb_altivec.c @ 30701:3c1f75f4affe
mention that on windows, you've to use the netstream.exe to build the
netstream binary
author | attila |
---|---|
date | Fri, 26 Feb 2010 12:49:49 +0000 |
parents | 4d50825554ee |
children | 140bde72c97f |
rev | line source |
---|---|
29028 | 1 /* |
2 * AltiVec acceleration for colorspace conversion | |
3 * | |
4 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com> | |
5 * | |
6 * This file is part of FFmpeg. | |
7 * | |
8 * FFmpeg is free software; you can redistribute it and/or | |
9 * modify it under the terms of the GNU Lesser General Public | |
10 * License as published by the Free Software Foundation; either | |
11 * version 2.1 of the License, or (at your option) any later version. | |
12 * | |
13 * FFmpeg is distributed in the hope that it will be useful, | |
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 * Lesser General Public License for more details. | |
17 * | |
18 * You should have received a copy of the GNU Lesser General Public | |
19 * License along with FFmpeg; if not, write to the Free Software | |
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 */ | |
22 | |
23 /* | |
24 Convert I420 YV12 to RGB in various formats, | |
25 it rejects images that are not in 420 formats, | |
26 it rejects images that don't have widths of multiples of 16, | |
27 it rejects images that don't have heights of multiples of 2. | |
28 Reject defers to C simulation code. | |
29 | |
30 Lots of optimizations to be done here. | |
31 | |
32 1. Need to fix saturation code. I just couldn't get it to fly with packs | |
33 and adds, so we currently use max/min to clip. | |
34 | |
35 2. The inefficient use of chroma loading needs a bit of brushing up. | |
36 | |
37 3. Analysis of pipeline stalls needs to be done. Use shark to identify | |
38 pipeline stalls. | |
39 | |
40 | |
41 MODIFIED to calculate coeffs from currently selected color space. | |
42 MODIFIED core to be a macro where you specify the output format. | |
43 ADDED UYVY conversion which is never called due to some thing in swscale. | |
44 CORRECTED algorithm selection to be strict on input formats. | |
45 ADDED runtime detection of AltiVec. | |
46 | |
47 ADDED altivec_yuv2packedX vertical scl + RGB converter | |
48 | |
49 March 27,2004 | |
50 PERFORMANCE ANALYSIS | |
51 | |
52 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo | |
53 used as test. | |
54 The AltiVec version uses 10% of the processor or ~100Mips for D1 video | |
55 same sequence. | |
56 | |
57 720 * 480 * 30 ~10MPS | |
58 | |
59 so we have roughly 10 clocks per pixel. This is too high, something has | |
60 to be wrong. | |
61 | |
62 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the | |
63 need for vec_min. | |
64 | |
65 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have | |
66 the input video frame, it was just decompressed so it probably resides in L1 | |
67 caches. However, we are creating the output video stream. This needs to use the | |
68 DSTST instruction to optimize for the cache. We couple this with the fact that | |
69 we are not going to be visiting the input buffer again so we mark it Least | |
70 Recently Used. This shaves 25% of the processor cycles off. | |
71 | |
72 Now memcpy is the largest mips consumer in the system, probably due | |
73 to the inefficient X11 stuff. | |
74 | |
75 GL libraries seem to be very slow on this machine 1.33Ghz PB running | |
76 Jaguar, this is not the case for my 1Ghz PB. I thought it might be | |
77 a versioning issue, however I have libGL.1.2.dylib for both | |
78 machines. (We need to figure this out now.) | |
79 | |
80 GL2 libraries work now with patch for RGB32. | |
81 | |
82 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor. | |
83 | |
84 Integrated luma prescaling adjustment for saturation/contrast/brightness | |
85 adjustment. | |
86 */ | |
87 | |
88 #include <stdio.h> | |
89 #include <stdlib.h> | |
90 #include <string.h> | |
91 #include <inttypes.h> | |
92 #include <assert.h> | |
93 #include "config.h" | |
94 #include "libswscale/rgb2rgb.h" | |
95 #include "libswscale/swscale.h" | |
96 #include "libswscale/swscale_internal.h" | |
97 | |
98 #undef PROFILE_THE_BEAST | |
99 #undef INC_SCALING | |
100 | |
101 typedef unsigned char ubyte; | |
102 typedef signed char sbyte; | |
103 | |
104 | |
105 /* RGB interleaver, 16 planar pels 8-bit samples per channel in | |
106 homogeneous vector registers x0,x1,x2 are interleaved with the | |
107 following technique: | |
108 | |
109 o0 = vec_mergeh (x0,x1); | |
110 o1 = vec_perm (o0, x2, perm_rgb_0); | |
111 o2 = vec_perm (o0, x2, perm_rgb_1); | |
112 o3 = vec_mergel (x0,x1); | |
113 o4 = vec_perm (o3,o2,perm_rgb_2); | |
114 o5 = vec_perm (o3,o2,perm_rgb_3); | |
115 | |
116 perm_rgb_0: o0(RG).h v1(B) --> o1* | |
117 0 1 2 3 4 | |
118 rgbr|gbrg|brgb|rgbr | |
119 0010 0100 1001 0010 | |
120 0102 3145 2673 894A | |
121 | |
122 perm_rgb_1: o0(RG).h v1(B) --> o2 | |
123 0 1 2 3 4 | |
124 gbrg|brgb|bbbb|bbbb | |
125 0100 1001 1111 1111 | |
126 B5CD 6EF7 89AB CDEF | |
127 | |
128 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4* | |
129 0 1 2 3 4 | |
130 gbrg|brgb|rgbr|gbrg | |
131 1111 1111 0010 0100 | |
132 89AB CDEF 0182 3945 | |
133 | |
134 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5* | |
135 0 1 2 3 4 | |
136 brgb|rgbr|gbrg|brgb | |
137 1001 0010 0100 1001 | |
138 a67b 89cA BdCD eEFf | |
139 | |
140 */ | |
141 static | |
142 const vector unsigned char | |
143 perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05, | |
144 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a}, | |
145 perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17, | |
146 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f}, | |
147 perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, | |
148 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05}, | |
149 perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a, | |
150 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f}; | |
151 | |
/*
 * Interleave three 16-byte planar vectors into three vectors of packed
 * 3-byte pixels.  Each output pixel is laid out as (x0, x1, x2); note that
 * the parameter list names the inputs in the reverse order (x2, x1, x0).
 * y0, y1 and y2 receive the first, second and third 16 output bytes.
 */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                   \
do {                                                    \
    __typeof__(x0) m3_hi, m3_mix, m3_lo;                \
    /* x0/x1 pairs for the first eight pixels */        \
    m3_hi  = vec_mergeh (x0,x1);                        \
    y0     = vec_perm (m3_hi, x2, perm_rgb_0);          \
    /* leftover pairs plus the upper half of x2 */      \
    m3_mix = vec_perm (m3_hi, x2, perm_rgb_1);          \
    /* x0/x1 pairs for the last eight pixels */         \
    m3_lo  = vec_mergel (x0,x1);                        \
    y1     = vec_perm (m3_lo, m3_mix, perm_rgb_2);      \
    y2     = vec_perm (m3_lo, m3_mix, perm_rgb_3);      \
} while(0)
162 | |
/*
 * Interleave x0,x1,x2 with vec_merge3 (arguments passed in the same order)
 * and store the resulting 48 bytes at ptr.  ptr must be an lvalue of
 * vector-pointer type; it is advanced by three vectors.
 */
#define vec_mstbgr24(x0,x1,x2,ptr)              \
do {                                            \
    __typeof__(x0) st_a, st_b, st_c;            \
    vec_merge3 (x0,x1,x2,st_a,st_b,st_c);       \
    vec_st (st_a, 0*16, ptr);                   \
    vec_st (st_b, 1*16, ptr);                   \
    vec_st (st_c, 2*16, ptr);                   \
    ptr += 3;                                   \
} while (0)
29028 | 171 |
/*
 * Same as vec_mstbgr24 but hands the three channels to vec_merge3 in the
 * opposite order (x2,x1,x0), which reverses the byte order inside each
 * stored pixel.  ptr must be an lvalue of vector-pointer type; it is
 * advanced by three vectors.
 */
#define vec_mstrgb24(x0,x1,x2,ptr)              \
do {                                            \
    __typeof__(x0) st_a, st_b, st_c;            \
    vec_merge3 (x2,x1,x0,st_a,st_b,st_c);       \
    vec_st (st_a, 0*16, ptr);                   \
    vec_st (st_b, 1*16, ptr);                   \
    vec_st (st_c, 2*16, ptr);                   \
    ptr += 3;                                   \
} while (0)
29028 | 180 |
181 /* pack the pixels in rgb0 format | |
182 msb R | |
183 lsb 0 | |
184 */ | |
/*
 * Interleave four 16-byte planar vectors into 16 packed 4-byte pixels laid
 * out as (x0, x1, x2, x3) and store the 64 bytes at ptr.  T is the vector
 * type of the inputs.  ptr must be an lvalue; it is advanced by 4 elements.
 */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
do {                                                                          \
    T pair01, pair23, quad_a, quad_b;                                         \
    /* pixels 0..7: byte pairs first, then 16-bit merges form the quads */    \
    pair01 = vec_mergeh (x0,x1);                                              \
    pair23 = vec_mergeh (x2,x3);                                              \
    quad_a = (T)vec_mergeh ((vector unsigned short)pair01,                    \
                            (vector unsigned short)pair23);                   \
    quad_b = (T)vec_mergel ((vector unsigned short)pair01,                    \
                            (vector unsigned short)pair23);                   \
    vec_st (quad_a, 0*16, (T *)ptr);                                          \
    vec_st (quad_b, 1*16, (T *)ptr);                                          \
    /* pixels 8..15 */                                                        \
    pair01 = vec_mergel (x0,x1);                                              \
    pair23 = vec_mergel (x2,x3);                                              \
    quad_a = (T)vec_mergeh ((vector unsigned short)pair01,                    \
                            (vector unsigned short)pair23);                   \
    quad_b = (T)vec_mergel ((vector unsigned short)pair01,                    \
                            (vector unsigned short)pair23);                   \
    vec_st (quad_a, 2*16, (T *)ptr);                                          \
    vec_st (quad_b, 3*16, (T *)ptr);                                          \
    ptr += 4;                                                                 \
} while (0)
29028 | 202 |
203 /* | |
204 | |
205 | 1 0 1.4021 | | Y | | |
206 | 1 -0.3441 -0.7142 |x| Cb| | |
207 | 1 1.7718 0 | | Cr| | |
208 | |
209 | |
210 Y: [-128 127] | |
211 Cb/Cr : [-128 127] | |
212 | |
213 Typical YUV conversions work on Y: 0-255; this version has been optimized for JPEG decode. | |
214 | |
215 */ | |
216 | |
217 | |
218 | |
219 | |
/*
 * Zero-extend bytes 0..7 of x into eight 16-bit lanes: every even result
 * byte comes from the zero vector (index 0x10), every odd one from x.
 */
#define vec_unh(x)                                                            \
    ((vector signed short)                                                    \
     vec_perm((x), (__typeof__(x)){0},                                        \
              ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                      0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07})))

/* Same as vec_unh, but for bytes 8..15 of x. */
#define vec_unl(x)                                                            \
    ((vector signed short)                                                    \
     vec_perm((x), (__typeof__(x)){0},                                        \
              ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                      0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F})))
230 | |
/* Clamp each signed 16-bit lane of x to the range [16, 235]. */
#define vec_clip_s16(x)                                                       \
    (vec_max (vec_min ((x),                                                   \
                       ((vector signed short){235,235,235,235,                \
                                              235,235,235,235})),             \
              ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16})))

/*
 * Pack two vectors of signed 16-bit lanes into one vector of unsigned bytes.
 * Negative lanes are raised to 0 by vec_max; vec_packs then saturates the
 * (now unsigned) values while narrowing.
 */
#define vec_packclp(x,y)                                                      \
    ((vector unsigned char)vec_packs                                          \
        ((vector unsigned short)vec_max ((x),((vector signed short) {0})),    \
         (vector unsigned short)vec_max ((y),((vector signed short) {0}))))
239 | |
240 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr) | |
241 | |
242 | |
243 static inline void cvtyuvtoRGB (SwsContext *c, | |
244 vector signed short Y, vector signed short U, vector signed short V, | |
245 vector signed short *R, vector signed short *G, vector signed short *B) | |
246 { | |
247 vector signed short vx,ux,uvx; | |
248 | |
249 Y = vec_mradds (Y, c->CY, c->OY); | |
250 U = vec_sub (U,(vector signed short) | |
251 vec_splat((vector signed short){128},0)); | |
252 V = vec_sub (V,(vector signed short) | |
253 vec_splat((vector signed short){128},0)); | |
254 | |
255 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15; | |
256 ux = vec_sl (U, c->CSHIFT); | |
257 *B = vec_mradds (ux, c->CBU, Y); | |
258 | |
259 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15; | |
260 vx = vec_sl (V, c->CSHIFT); | |
261 *R = vec_mradds (vx, c->CRV, Y); | |
262 | |
263 // uvx = ((CGU*u) + (CGV*v))>>15; | |
264 uvx = vec_mradds (U, c->CGU, Y); | |
265 *G = vec_mradds (V, c->CGV, uvx); | |
266 } | |
267 | |
268 | |
269 /* | |
270 ------------------------------------------------------------------------------ | |
271 CS converters | |
272 ------------------------------------------------------------------------------ | |
273 */ | |
274 | |
275 | |
/*
 * DEFCSP420_CVT(name, out_pixels) defines
 *
 *     static int altivec_<name>(SwsContext *c,
 *                               unsigned char **in, int *instrides,
 *                               int srcSliceY, int srcSliceH,
 *                               unsigned char **oplanes, int *outstrides)
 *
 * which converts a slice of planar input (in[0] = Y, in[1] = U, in[2] = V,
 * one chroma row per two luma rows) to packed pixels in oplanes[0].
 *
 * out_pixels(R,G,B,ptr) must store 16 pixels at ptr and advance ptr past
 * them (see the out_* macros).
 *
 * Each pass of the inner loop handles 16 pixels on two adjacent rows that
 * share 8 chroma samples.  Only w/16 whole groups and h/2 whole row pairs
 * are converted.  The function returns srcSliceH.
 *
 * The output row pointers are recomputed from outstrides[0] for every row
 * pair.  Previously they were advanced by outstrides[0] on top of the bytes
 * already written, which is only correct when the stride equals the number
 * of bytes written per row.
 *
 * NOTE(review): the input loads always fetch two consecutive vectors and
 * combine them with vec_perm, so they can touch up to 16 bytes past the
 * last sample of a row -- confirm the input planes are padded.
 */
#define DEFCSP420_CVT(name,out_pixels)                                         \
static int altivec_##name (SwsContext *c,                                      \
                           unsigned char **in, int *instrides,                 \
                           int srcSliceY, int srcSliceH,                       \
                           unsigned char **oplanes, int *outstrides)           \
{                                                                              \
    int w = c->srcW;                                                           \
    int h = srcSliceH;                                                         \
    int i,j;                                                                   \
    int instrides_scl[3];                                                      \
    vector unsigned char y0,y1;                                                \
                                                                               \
    vector signed char  u,v;                                                   \
                                                                               \
    vector signed short Y0,Y1,Y2,Y3;                                           \
    vector signed short U,V;                                                   \
    vector signed short vx,ux,uvx;                                             \
    vector signed short vx0,ux0,uvx0;                                          \
    vector signed short vx1,ux1,uvx1;                                          \
    vector signed short R0,G0,B0;                                              \
    vector signed short R1,G1,B1;                                              \
    vector unsigned char R,G,B;                                                \
                                                                               \
    vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                         \
    vector unsigned char align_perm;                                           \
                                                                               \
    vector signed short                                                        \
        lCY  = c->CY,                                                          \
        lOY  = c->OY,                                                          \
        lCRV = c->CRV,                                                         \
        lCBU = c->CBU,                                                         \
        lCGU = c->CGU,                                                         \
        lCGV = c->CGV;                                                         \
                                                                               \
    vector unsigned short lCSHIFT = c->CSHIFT;                                 \
                                                                               \
    ubyte *y1i   = in[0];                                                      \
    ubyte *y2i   = in[0]+instrides[0];                                         \
    ubyte *ui    = in[1];                                                      \
    ubyte *vi    = in[2];                                                      \
                                                                               \
    /* first output row of this slice */                                       \
    ubyte *dst   = oplanes[0]+srcSliceY*outstrides[0];                         \
    vector unsigned char *oute, *outo;                                         \
                                                                               \
    instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */    \
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */       \
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */       \
                                                                               \
    for (i=0;i<h/2;i++) {                                                      \
        /* even and odd output rows of this pair, taken from the stride */     \
        oute = (vector unsigned char *)(dst + (2*i  )*outstrides[0]);          \
        outo = (vector unsigned char *)(dst + (2*i+1)*outstrides[0]);          \
                                                                               \
        /* data-stream-touch-for-store hints for the two output rows */        \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 \
                                                                               \
        for (j=0;j<w/16;j++) {                                                 \
                                                                               \
            y1ivP = (vector unsigned char *)y1i;                               \
            y2ivP = (vector unsigned char *)y2i;                               \
            uivP  = (vector unsigned char *)ui;                                \
            vivP  = (vector unsigned char *)vi;                                \
                                                                               \
            /* unaligned loads: two aligned vectors combined via lvsl */       \
            align_perm = vec_lvsl (0, y1i);                                    \
            y0 = (vector unsigned char)                                        \
                 vec_perm (y1ivP[0], y1ivP[1], align_perm);                    \
                                                                               \
            align_perm = vec_lvsl (0, y2i);                                    \
            y1 = (vector unsigned char)                                        \
                 vec_perm (y2ivP[0], y2ivP[1], align_perm);                    \
                                                                               \
            align_perm = vec_lvsl (0, ui);                                     \
            u = (vector signed char)                                           \
                vec_perm (uivP[0], uivP[1], align_perm);                       \
                                                                               \
            align_perm = vec_lvsl (0, vi);                                     \
            v = (vector signed char)                                           \
                vec_perm (vivP[0], vivP[1], align_perm);                       \
                                                                               \
            /* remove the chroma bias; {128} is splatted to all lanes */       \
            u = (vector signed char)                                           \
                vec_sub (u,(vector signed char)                                \
                         vec_splat((vector signed char){128},0));              \
            v = (vector signed char)                                           \
                vec_sub (v,(vector signed char)                                \
                         vec_splat((vector signed char){128},0));              \
                                                                               \
            /* only the first 8 chroma samples are used per 16 pixels */       \
            U = vec_unpackh (u);                                               \
            V = vec_unpackh (v);                                               \
                                                                               \
            Y0 = vec_unh (y0);                                                 \
            Y1 = vec_unl (y0);                                                 \
            Y2 = vec_unh (y1);                                                 \
            Y3 = vec_unl (y1);                                                 \
                                                                               \
            Y0 = vec_mradds (Y0, lCY, lOY);                                    \
            Y1 = vec_mradds (Y1, lCY, lOY);                                    \
            Y2 = vec_mradds (Y2, lCY, lOY);                                    \
            Y3 = vec_mradds (Y3, lCY, lOY);                                    \
                                                                               \
            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                         \
            ux = vec_sl (U, lCSHIFT);                                          \
            ux = vec_mradds (ux, lCBU, (vector signed short){0});              \
            /* duplicate each chroma term for two horizontal pixels */         \
            ux0 = vec_mergeh (ux,ux);                                          \
            ux1 = vec_mergel (ux,ux);                                          \
                                                                               \
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */                         \
            vx = vec_sl (V, lCSHIFT);                                          \
            vx = vec_mradds (vx, lCRV, (vector signed short){0});              \
            vx0 = vec_mergeh (vx,vx);                                          \
            vx1 = vec_mergel (vx,vx);                                          \
                                                                               \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */                                \
            uvx = vec_mradds (U, lCGU, (vector signed short){0});              \
            uvx = vec_mradds (V, lCGV, uvx);                                   \
            uvx0 = vec_mergeh (uvx,uvx);                                       \
            uvx1 = vec_mergel (uvx,uvx);                                       \
                                                                               \
            /* even row */                                                     \
            R0 = vec_add (Y0,vx0);                                             \
            G0 = vec_add (Y0,uvx0);                                            \
            B0 = vec_add (Y0,ux0);                                             \
            R1 = vec_add (Y1,vx1);                                             \
            G1 = vec_add (Y1,uvx1);                                            \
            B1 = vec_add (Y1,ux1);                                             \
                                                                               \
            R  = vec_packclp (R0,R1);                                          \
            G  = vec_packclp (G0,G1);                                          \
            B  = vec_packclp (B0,B1);                                          \
                                                                               \
            out_pixels(R,G,B,oute);                                            \
                                                                               \
            /* odd row, same chroma terms */                                   \
            R0 = vec_add (Y2,vx0);                                             \
            G0 = vec_add (Y2,uvx0);                                            \
            B0 = vec_add (Y2,ux0);                                             \
            R1 = vec_add (Y3,vx1);                                             \
            G1 = vec_add (Y3,uvx1);                                            \
            B1 = vec_add (Y3,ux1);                                             \
            R  = vec_packclp (R0,R1);                                          \
            G  = vec_packclp (G0,G1);                                          \
            B  = vec_packclp (B0,B1);                                          \
                                                                               \
            out_pixels(R,G,B,outo);                                            \
                                                                               \
            y1i  += 16;                                                        \
            y2i  += 16;                                                        \
            ui   += 8;                                                         \
            vi   += 8;                                                         \
                                                                               \
        }                                                                      \
                                                                               \
        ui    += instrides_scl[1];                                             \
        vi    += instrides_scl[2];                                             \
        y1i   += instrides_scl[0];                                             \
        y2i   += instrides_scl[0];                                             \
    }                                                                          \
    return srcSliceH;                                                          \
}
439 | |
440 | |
/*
 * Pixel store macros: out_<fmt>(a,b,c,ptr) stores 16 pixels built from the
 * channel vectors a (R), b (G) and c (B) at ptr in the named byte order and
 * advances ptr past them.
 *
 * The alpha channel is a vector with 255 in every lane.  A brace initializer
 * such as (vector unsigned char){255} sets only element 0 and zero-fills the
 * rest, so the value has to be splatted; otherwise only the first pixel of
 * every 16 would be opaque.
 */
#define vec_opaque_alpha(a) \
    ((__typeof__ (a)) vec_splat ((__typeof__ (a)){255}, 0))

#define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),vec_opaque_alpha(a),c,b,a,ptr)
#define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,vec_opaque_alpha(a),ptr)
#define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,vec_opaque_alpha(a),ptr)
#define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),vec_opaque_alpha(a),a,b,c,ptr)
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
447 | |
448 DEFCSP420_CVT (yuv2_abgr, out_abgr) | |
449 #if 1 | |
450 DEFCSP420_CVT (yuv2_bgra, out_bgra) | |
451 #else | |
452 static int altivec_yuv2_bgra32 (SwsContext *c, | |
453 unsigned char **in, int *instrides, | |
454 int srcSliceY, int srcSliceH, | |
455 unsigned char **oplanes, int *outstrides) | |
456 { | |
457 int w = c->srcW; | |
458 int h = srcSliceH; | |
459 int i,j; | |
460 int instrides_scl[3]; | |
461 vector unsigned char y0,y1; | |
462 | |
463 vector signed char u,v; | |
464 | |
465 vector signed short Y0,Y1,Y2,Y3; | |
466 vector signed short U,V; | |
467 vector signed short vx,ux,uvx; | |
468 vector signed short vx0,ux0,uvx0; | |
469 vector signed short vx1,ux1,uvx1; | |
470 vector signed short R0,G0,B0; | |
471 vector signed short R1,G1,B1; | |
472 vector unsigned char R,G,B; | |
473 | |
474 vector unsigned char *uivP, *vivP; | |
475 vector unsigned char align_perm; | |
476 | |
477 vector signed short | |
478 lCY = c->CY, | |
479 lOY = c->OY, | |
480 lCRV = c->CRV, | |
481 lCBU = c->CBU, | |
482 lCGU = c->CGU, | |
483 lCGV = c->CGV; | |
484 | |
485 vector unsigned short lCSHIFT = c->CSHIFT; | |
486 | |
487 ubyte *y1i = in[0]; | |
488 ubyte *y2i = in[0]+w; | |
489 ubyte *ui = in[1]; | |
490 ubyte *vi = in[2]; | |
491 | |
492 vector unsigned char *oute | |
493 = (vector unsigned char *) | |
494 (oplanes[0]+srcSliceY*outstrides[0]); | |
495 vector unsigned char *outo | |
496 = (vector unsigned char *) | |
497 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); | |
498 | |
499 | |
500 instrides_scl[0] = instrides[0]; | |
501 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ | |
502 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ | |
503 | |
504 | |
505 for (i=0;i<h/2;i++) { | |
506 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); | |
507 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); | |
508 | |
509 for (j=0;j<w/16;j++) { | |
510 | |
511 y0 = vec_ldl (0,y1i); | |
512 y1 = vec_ldl (0,y2i); | |
513 uivP = (vector unsigned char *)ui; | |
514 vivP = (vector unsigned char *)vi; | |
515 | |
516 align_perm = vec_lvsl (0, ui); | |
517 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); | |
518 | |
519 align_perm = vec_lvsl (0, vi); | |
520 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); | |
521 u = (vector signed char) | |
522 vec_sub (u,(vector signed char) | |
523 vec_splat((vector signed char){128},0)); | |
524 | |
525 v = (vector signed char) | |
526 vec_sub (v, (vector signed char) | |
527 vec_splat((vector signed char){128},0)); | |
528 | |
529 U = vec_unpackh (u); | |
530 V = vec_unpackh (v); | |
531 | |
532 | |
533 Y0 = vec_unh (y0); | |
534 Y1 = vec_unl (y0); | |
535 Y2 = vec_unh (y1); | |
536 Y3 = vec_unl (y1); | |
537 | |
538 Y0 = vec_mradds (Y0, lCY, lOY); | |
539 Y1 = vec_mradds (Y1, lCY, lOY); | |
540 Y2 = vec_mradds (Y2, lCY, lOY); | |
541 Y3 = vec_mradds (Y3, lCY, lOY); | |
542 | |
543 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ | |
544 ux = vec_sl (U, lCSHIFT); | |
545 ux = vec_mradds (ux, lCBU, (vector signed short){0}); | |
546 ux0 = vec_mergeh (ux,ux); | |
547 ux1 = vec_mergel (ux,ux); | |
548 | |
549 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ | |
550 vx = vec_sl (V, lCSHIFT); | |
551 vx = vec_mradds (vx, lCRV, (vector signed short){0}); | |
552 vx0 = vec_mergeh (vx,vx); | |
553 vx1 = vec_mergel (vx,vx); | |
554 /* uvx = ((CGU*u) + (CGV*v))>>15 */ | |
555 uvx = vec_mradds (U, lCGU, (vector signed short){0}); | |
556 uvx = vec_mradds (V, lCGV, uvx); | |
557 uvx0 = vec_mergeh (uvx,uvx); | |
558 uvx1 = vec_mergel (uvx,uvx); | |
559 R0 = vec_add (Y0,vx0); | |
560 G0 = vec_add (Y0,uvx0); | |
561 B0 = vec_add (Y0,ux0); | |
562 R1 = vec_add (Y1,vx1); | |
563 G1 = vec_add (Y1,uvx1); | |
564 B1 = vec_add (Y1,ux1); | |
565 R = vec_packclp (R0,R1); | |
566 G = vec_packclp (G0,G1); | |
567 B = vec_packclp (B0,B1); | |
568 | |
569 out_argb(R,G,B,oute); | |
570 R0 = vec_add (Y2,vx0); | |
571 G0 = vec_add (Y2,uvx0); | |
572 B0 = vec_add (Y2,ux0); | |
573 R1 = vec_add (Y3,vx1); | |
574 G1 = vec_add (Y3,uvx1); | |
575 B1 = vec_add (Y3,ux1); | |
576 R = vec_packclp (R0,R1); | |
577 G = vec_packclp (G0,G1); | |
578 B = vec_packclp (B0,B1); | |
579 | |
580 out_argb(R,G,B,outo); | |
581 y1i += 16; | |
582 y2i += 16; | |
583 ui += 8; | |
584 vi += 8; | |
585 | |
586 } | |
587 | |
588 outo += (outstrides[0])>>4; | |
589 oute += (outstrides[0])>>4; | |
590 | |
591 ui += instrides_scl[1]; | |
592 vi += instrides_scl[2]; | |
593 y1i += instrides_scl[0]; | |
594 y2i += instrides_scl[0]; | |
595 } | |
596 return srcSliceH; | |
597 } | |
598 | |
599 #endif | |
600 | |
601 | |
602 DEFCSP420_CVT (yuv2_rgba, out_rgba) | |
603 DEFCSP420_CVT (yuv2_argb, out_argb) | |
604 DEFCSP420_CVT (yuv2_rgb24, out_rgb24) | |
605 DEFCSP420_CVT (yuv2_bgr24, out_bgr24) | |
606 | |
607 | |
608 // uyvy|uyvy|uyvy|uyvy | |
609 // 0123 4567 89ab cdef | |
610 static | |
611 const vector unsigned char | |
612 demux_u = {0x10,0x00,0x10,0x00, | |
613 0x10,0x04,0x10,0x04, | |
614 0x10,0x08,0x10,0x08, | |
615 0x10,0x0c,0x10,0x0c}, | |
616 demux_v = {0x10,0x02,0x10,0x02, | |
617 0x10,0x06,0x10,0x06, | |
618 0x10,0x0A,0x10,0x0A, | |
619 0x10,0x0E,0x10,0x0E}, | |
620 demux_y = {0x10,0x01,0x10,0x03, | |
621 0x10,0x05,0x10,0x07, | |
622 0x10,0x09,0x10,0x0B, | |
623 0x10,0x0D,0x10,0x0F}; | |
624 | |
625 /* | |
626 this is so I can play live CCIR raw video | |
627 */ | |
628 static int altivec_uyvy_rgb32 (SwsContext *c, | |
629 unsigned char **in, int *instrides, | |
630 int srcSliceY, int srcSliceH, | |
631 unsigned char **oplanes, int *outstrides) | |
632 { | |
633 int w = c->srcW; | |
634 int h = srcSliceH; | |
635 int i,j; | |
636 vector unsigned char uyvy; | |
637 vector signed short Y,U,V; | |
638 vector signed short R0,G0,B0,R1,G1,B1; | |
639 vector unsigned char R,G,B; | |
640 vector unsigned char *out; | |
641 ubyte *img; | |
642 | |
643 img = in[0]; | |
644 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]); | |
645 | |
646 for (i=0;i<h;i++) { | |
647 for (j=0;j<w/16;j++) { | |
648 uyvy = vec_ld (0, img); | |
649 U = (vector signed short) | |
650 vec_perm (uyvy, (vector unsigned char){0}, demux_u); | |
651 | |
652 V = (vector signed short) | |
653 vec_perm (uyvy, (vector unsigned char){0}, demux_v); | |
654 | |
655 Y = (vector signed short) | |
656 vec_perm (uyvy, (vector unsigned char){0}, demux_y); | |
657 | |
658 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0); | |
659 | |
660 uyvy = vec_ld (16, img); | |
661 U = (vector signed short) | |
662 vec_perm (uyvy, (vector unsigned char){0}, demux_u); | |
663 | |
664 V = (vector signed short) | |
665 vec_perm (uyvy, (vector unsigned char){0}, demux_v); | |
666 | |
667 Y = (vector signed short) | |
668 vec_perm (uyvy, (vector unsigned char){0}, demux_y); | |
669 | |
670 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1); | |
671 | |
672 R = vec_packclp (R0,R1); | |
673 G = vec_packclp (G0,G1); | |
674 B = vec_packclp (B0,B1); | |
675 | |
676 // vec_mstbgr24 (R,G,B, out); | |
677 out_rgba (R,G,B,out); | |
678 | |
679 img += 32; | |
680 } | |
681 } | |
682 return srcSliceH; | |
683 } | |
684 | |
685 | |
686 | |
687 /* Ok currently the acceleration routine only supports | |
688 inputs of widths a multiple of 16 | |
689 and heights a multiple 2 | |
690 | |
691 So we just fall back to the C codes for this. | |
692 */ | |
693 SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c) | |
694 { | |
695 if (!(c->flags & SWS_CPU_CAPS_ALTIVEC)) | |
696 return NULL; | |
697 | |
698 /* | |
699 and this seems not to matter too much I tried a bunch of | |
700 videos with abnormal widths and MPlayer crashes elsewhere. | |
701 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv | |
702 boom with X11 bad match. | |
703 | |
704 */ | |
705 if ((c->srcW & 0xf) != 0) return NULL; | |
706 | |
707 switch (c->srcFormat) { | |
708 case PIX_FMT_YUV410P: | |
709 case PIX_FMT_YUV420P: | |
710 /*case IMGFMT_CLPL: ??? */ | |
711 case PIX_FMT_GRAY8: | |
712 case PIX_FMT_NV12: | |
713 case PIX_FMT_NV21: | |
714 if ((c->srcH & 0x1) != 0) | |
715 return NULL; | |
716 | |
29481 | 717 switch(c->dstFormat) { |
29028 | 718 case PIX_FMT_RGB24: |
719 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n"); | |
720 return altivec_yuv2_rgb24; | |
721 case PIX_FMT_BGR24: | |
722 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n"); | |
723 return altivec_yuv2_bgr24; | |
724 case PIX_FMT_ARGB: | |
725 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n"); | |
726 return altivec_yuv2_argb; | |
727 case PIX_FMT_ABGR: | |
728 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n"); | |
729 return altivec_yuv2_abgr; | |
730 case PIX_FMT_RGBA: | |
731 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n"); | |
732 return altivec_yuv2_rgba; | |
733 case PIX_FMT_BGRA: | |
734 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n"); | |
735 return altivec_yuv2_bgra; | |
736 default: return NULL; | |
737 } | |
738 break; | |
739 | |
740 case PIX_FMT_UYVY422: | |
29481 | 741 switch(c->dstFormat) { |
29028 | 742 case PIX_FMT_BGR32: |
743 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n"); | |
744 return altivec_uyvy_rgb32; | |
745 default: return NULL; | |
746 } | |
747 break; | |
748 | |
749 } | |
750 return NULL; | |
751 } | |
752 | |
753 void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation) | |
754 { | |
755 union { | |
30343
4d50825554ee
Move array specifiers outside DECLARE_ALIGNED() invocations
mru
parents:
29632
diff
changeset
|
756 DECLARE_ALIGNED(16, signed short, tmp)[8]; |
29028 | 757 vector signed short vec; |
758 } buf; | |
759 | |
760 buf.tmp[0] = ((0xffffLL) * contrast>>8)>>9; //cy | |
761 buf.tmp[1] = -256*brightness; //oy | |
762 buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv | |
763 buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu | |
764 buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu | |
765 buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv | |
766 | |
767 | |
768 c->CSHIFT = (vector unsigned short)vec_splat_u16(2); | |
769 c->CY = vec_splat ((vector signed short)buf.vec, 0); | |
770 c->OY = vec_splat ((vector signed short)buf.vec, 1); | |
771 c->CRV = vec_splat ((vector signed short)buf.vec, 2); | |
772 c->CBU = vec_splat ((vector signed short)buf.vec, 3); | |
773 c->CGU = vec_splat ((vector signed short)buf.vec, 4); | |
774 c->CGV = vec_splat ((vector signed short)buf.vec, 5); | |
775 return; | |
776 } | |
777 | |
778 | |
779 void | |
780 ff_yuv2packedX_altivec(SwsContext *c, | |
29239
882a1f5613e1
Add missing const qualifiers to AltiVec function parameters where appropriate.
diego
parents:
29028
diff
changeset
|
781 const int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, |
882a1f5613e1
Add missing const qualifiers to AltiVec function parameters where appropriate.
diego
parents:
29028
diff
changeset
|
782 const int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, |
29028 | 783 uint8_t *dest, int dstW, int dstY) |
784 { | |
785 int i,j; | |
786 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V; | |
787 vector signed short R0,G0,B0,R1,G1,B1; | |
788 | |
789 vector unsigned char R,G,B; | |
790 vector unsigned char *out,*nout; | |
791 | |
792 vector signed short RND = vec_splat_s16(1<<3); | |
793 vector unsigned short SCL = vec_splat_u16(4); | |
30343
4d50825554ee
Move array specifiers outside DECLARE_ALIGNED() invocations
mru
parents:
29632
diff
changeset
|
794 DECLARE_ALIGNED(16, unsigned long, scratch)[16]; |
29028 | 795 |
796 vector signed short *YCoeffs, *CCoeffs; | |
797 | |
798 YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize; | |
799 CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize; | |
800 | |
801 out = (vector unsigned char *)dest; | |
802 | |
29481 | 803 for (i=0; i<dstW; i+=16) { |
29028 | 804 Y0 = RND; |
805 Y1 = RND; | |
806 /* extract 16 coeffs from lumSrc */ | |
807 for (j=0; j<lumFilterSize; j++) { | |
808 X0 = vec_ld (0, &lumSrc[j][i]); | |
809 X1 = vec_ld (16, &lumSrc[j][i]); | |
810 Y0 = vec_mradds (X0, YCoeffs[j], Y0); | |
811 Y1 = vec_mradds (X1, YCoeffs[j], Y1); | |
812 } | |
813 | |
814 U = RND; | |
815 V = RND; | |
816 /* extract 8 coeffs from U,V */ | |
817 for (j=0; j<chrFilterSize; j++) { | |
818 X = vec_ld (0, &chrSrc[j][i/2]); | |
819 U = vec_mradds (X, CCoeffs[j], U); | |
820 X = vec_ld (0, &chrSrc[j][i/2+2048]); | |
821 V = vec_mradds (X, CCoeffs[j], V); | |
822 } | |
823 | |
824 /* scale and clip signals */ | |
825 Y0 = vec_sra (Y0, SCL); | |
826 Y1 = vec_sra (Y1, SCL); | |
827 U = vec_sra (U, SCL); | |
828 V = vec_sra (V, SCL); | |
829 | |
830 Y0 = vec_clip_s16 (Y0); | |
831 Y1 = vec_clip_s16 (Y1); | |
832 U = vec_clip_s16 (U); | |
833 V = vec_clip_s16 (V); | |
834 | |
835 /* now we have | |
836 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 | |
837 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7 | |
838 | |
839 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 | |
840 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7 | |
841 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7 | |
842 */ | |
843 | |
844 U0 = vec_mergeh (U,U); | |
845 V0 = vec_mergeh (V,V); | |
846 | |
847 U1 = vec_mergel (U,U); | |
848 V1 = vec_mergel (V,V); | |
849 | |
850 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); | |
851 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); | |
852 | |
853 R = vec_packclp (R0,R1); | |
854 G = vec_packclp (G0,G1); | |
855 B = vec_packclp (B0,B1); | |
856 | |
857 switch(c->dstFormat) { | |
29480 | 858 case PIX_FMT_ABGR: out_abgr (R,G,B,out); break; |
859 case PIX_FMT_BGRA: out_bgra (R,G,B,out); break; | |
860 case PIX_FMT_RGBA: out_rgba (R,G,B,out); break; | |
861 case PIX_FMT_ARGB: out_argb (R,G,B,out); break; | |
862 case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break; | |
863 case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break; | |
864 default: | |
29028 | 865 { |
866 /* If this is reached, the caller should have called yuv2packedXinC | |
867 instead. */ | |
868 static int printed_error_message; | |
869 if (!printed_error_message) { | |
870 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n", | |
871 sws_format_name(c->dstFormat)); | |
872 printed_error_message=1; | |
873 } | |
874 return; | |
875 } | |
876 } | |
877 } | |
878 | |
879 if (i < dstW) { | |
880 i -= 16; | |
881 | |
882 Y0 = RND; | |
883 Y1 = RND; | |
884 /* extract 16 coeffs from lumSrc */ | |
885 for (j=0; j<lumFilterSize; j++) { | |
886 X0 = vec_ld (0, &lumSrc[j][i]); | |
887 X1 = vec_ld (16, &lumSrc[j][i]); | |
888 Y0 = vec_mradds (X0, YCoeffs[j], Y0); | |
889 Y1 = vec_mradds (X1, YCoeffs[j], Y1); | |
890 } | |
891 | |
892 U = RND; | |
893 V = RND; | |
894 /* extract 8 coeffs from U,V */ | |
895 for (j=0; j<chrFilterSize; j++) { | |
896 X = vec_ld (0, &chrSrc[j][i/2]); | |
897 U = vec_mradds (X, CCoeffs[j], U); | |
898 X = vec_ld (0, &chrSrc[j][i/2+2048]); | |
899 V = vec_mradds (X, CCoeffs[j], V); | |
900 } | |
901 | |
902 /* scale and clip signals */ | |
903 Y0 = vec_sra (Y0, SCL); | |
904 Y1 = vec_sra (Y1, SCL); | |
905 U = vec_sra (U, SCL); | |
906 V = vec_sra (V, SCL); | |
907 | |
908 Y0 = vec_clip_s16 (Y0); | |
909 Y1 = vec_clip_s16 (Y1); | |
910 U = vec_clip_s16 (U); | |
911 V = vec_clip_s16 (V); | |
912 | |
913 /* now we have | |
914 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 | |
915 U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7 | |
916 | |
917 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 | |
918 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7 | |
919 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7 | |
920 */ | |
921 | |
922 U0 = vec_mergeh (U,U); | |
923 V0 = vec_mergeh (V,V); | |
924 | |
925 U1 = vec_mergel (U,U); | |
926 V1 = vec_mergel (V,V); | |
927 | |
928 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); | |
929 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); | |
930 | |
931 R = vec_packclp (R0,R1); | |
932 G = vec_packclp (G0,G1); | |
933 B = vec_packclp (B0,B1); | |
934 | |
935 nout = (vector unsigned char *)scratch; | |
936 switch(c->dstFormat) { | |
29480 | 937 case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break; |
938 case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break; | |
939 case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break; | |
940 case PIX_FMT_ARGB: out_argb (R,G,B,nout); break; | |
941 case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break; | |
942 case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break; | |
943 default: | |
944 /* Unreachable, I think. */ | |
945 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n", | |
946 sws_format_name(c->dstFormat)); | |
947 return; | |
29028 | 948 } |
949 | |
950 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4); | |
951 } | |
952 | |
953 } |