Mercurial > mplayer.hg
annotate postproc/yuv2rgb_altivec.c @ 18049:77a3b0d11ca5
Limit the number of entries to the amount that fits into the chunk.
The function needs a rewrite, as it assumes quite a few things that are not guaranteed by the specifications.
author | iive |
---|---|
date | Thu, 06 Apr 2006 20:04:02 +0000 |
parents | 194a848d907f |
children | 62cdcca26777 |
rev | line source |
---|---|
12698 | 1 /* |
2 marc.hoffman@analog.com March 8, 2004 | |
3 | |
4 Altivec Acceleration for Color Space Conversion revision 0.2 | |
5 | |
6 convert I420 YV12 to RGB in various formats, | |
7 it rejects images that are not in 420 formats | |
8 it rejects images that don't have widths of multiples of 16 | |
9 it rejects images that don't have heights of multiples of 2 | |
10 reject defers to C simulation codes. | |
11 | |
12 lots of optimizations to be done here | |
13 | |
14 1. need to fix saturation code, I just couldn't get it to fly with packs and adds. | |
15 so we currently use max min to clip | |
16 | |
17 2. the inefficient use of chroma loading needs a bit of brushing up | |
18 | |
19 3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls | |
20 | |
21 | |
22 MODIFIED to calculate coeffs from currently selected color space. | |
23 MODIFIED core to be a macro which you spec the output format. | |
24 ADDED UYVY conversion which is never called due to some thing in SWSCALE. | |
25 CORRECTED algorithm selection to be strict on input formats. | |
26 ADDED runtime detection of altivec. | |
27 | |
28 ADDED altivec_yuv2packedX vertical scl + RGB converter | |
29 | |
30 March 27,2004 | |
31 PERFORMANCE ANALYSIS | |
32 | |
33 The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test | |
34 The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence | |
35 | |
36 720*480*30 ~10MPS | |
37 | |
38 so we have roughly 10clocks per pixel this is too high something has to be wrong. | |
39 | |
40 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min. | |
41 | |
42 OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much | |
43 guaranteed to have the input video frame it was just decompressed so | |
44 it probably resides in L1 caches. However we are creating the | |
45 output video stream this needs to use the DSTST instruction to | |
46 optimize for the cache. We couple this with the fact that we are | |
47 not going to be visiting the input buffer again so we mark it Least | |
48 Recently Used. This shaves 25% of the processor cycles off. | |
49 | |
50 Now MEMCPY is the largest mips consumer in the system, probably due | |
51 to the inefficient X11 stuff. | |
52 | |
53 GL libraries seem to be very slow on this machine 1.33Ghz PB running | |
54 Jaguar, this is not the case for my 1Ghz PB. I thought it might be | |
55 a versioning issues, however i have libGL.1.2.dylib for both | |
56 machines. ((We need to figure this out now)) | |
57 | |
58 GL2 libraries work now with patch for RGB32 | |
59 | |
60 NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor | |
61 | |
62 Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment. | |
63 | |
64 */ | |
65 #include <stdio.h> | |
66 #include <stdlib.h> | |
12836 | 67 #include <string.h> |
12698 | 68 #include <inttypes.h> |
69 #include <assert.h> | |
70 #include "config.h" | |
17558
ad90899eeee6
AltiVec operations need to have memory aligned on 16-byte boundaries.
diego
parents:
17557
diff
changeset
|
71 #ifdef HAVE_MALLOC_H |
ad90899eeee6
AltiVec operations need to have memory aligned on 16-byte boundaries.
diego
parents:
17557
diff
changeset
|
72 #include <malloc.h> |
ad90899eeee6
AltiVec operations need to have memory aligned on 16-byte boundaries.
diego
parents:
17557
diff
changeset
|
73 #endif |
12698 | 74 #include "rgb2rgb.h" |
75 #include "swscale.h" | |
76 #include "swscale_internal.h" | |
16985 | 77 #include "mangle.h" |
78 #include "libvo/img_format.h" //FIXME try to reduce dependency of such stuff | |
12698 | 79 |
80 #undef PROFILE_THE_BEAST | |
81 #undef INC_SCALING | |
82 | |
83 typedef unsigned char ubyte; | |
84 typedef signed char sbyte; | |
85 | |
86 | |
87 /* RGB interleaver, 16 planar pels 8-bit samples per channel in | |
88 homogeneous vector registers x0,x1,x2 are interleaved with the | |
89 following technique: | |
90 | |
91 o0 = vec_mergeh (x0,x1); | |
92 o1 = vec_perm (o0, x2, perm_rgb_0); | |
93 o2 = vec_perm (o0, x2, perm_rgb_1); | |
94 o3 = vec_mergel (x0,x1); | |
95 o4 = vec_perm (o3,o2,perm_rgb_2); | |
96 o5 = vec_perm (o3,o2,perm_rgb_3); | |
97 | |
98 perm_rgb_0: o0(RG).h v1(B) --> o1* | |
99 0 1 2 3 4 | |
100 rgbr|gbrg|brgb|rgbr | |
101 0010 0100 1001 0010 | |
102 0102 3145 2673 894A | |
103 | |
104 perm_rgb_1: o0(RG).h v1(B) --> o2 | |
105 0 1 2 3 4 | |
106 gbrg|brgb|bbbb|bbbb | |
107 0100 1001 1111 1111 | |
108 B5CD 6EF7 89AB CDEF | |
109 | |
110 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4* | |
111 0 1 2 3 4 | |
112 gbrg|brgb|rgbr|gbrg | |
113 1111 1111 0010 0100 | |
114 89AB CDEF 0182 3945 | |
115 | |
116 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5* | |
117 0 1 2 3 4 | |
118 brgb|rgbr|gbrg|brgb | |
119 1001 0010 0100 1001 | |
120 a67b 89cA BdCD eEFf | |
121 | |
122 */ | |
123 static | |
124 const vector unsigned char | |
13564 | 125 perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05, |
12698 | 126 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a), |
13564 | 127 perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17, |
12698 | 128 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f), |
13564 | 129 perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, |
12698 | 130 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05), |
13564 | 131 perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a, |
12698 | 132 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f); |
133 | |
134 #define vec_merge3(x2,x1,x0,y0,y1,y2) \ | |
135 do { \ | |
136 typeof(x0) o0,o2,o3; \ | |
137 o0 = vec_mergeh (x0,x1); \ | |
138 y0 = vec_perm (o0, x2, perm_rgb_0);\ | |
139 o2 = vec_perm (o0, x2, perm_rgb_1);\ | |
140 o3 = vec_mergel (x0,x1); \ | |
141 y1 = vec_perm (o3,o2,perm_rgb_2); \ | |
142 y2 = vec_perm (o3,o2,perm_rgb_3); \ | |
143 } while(0) | |
144 | |
17563
8084bcdb4898
Correct RGB vs. BGR confusion, the macros vec_mstrgb24 and vec_mstbgr24 each
diego
parents:
17560
diff
changeset
|
/*
 * Interleave three planar 16-sample vectors with vec_merge3 and store the
 * three resulting vectors through ptr.
 *
 * The arguments are handed to vec_merge3 in the order (x0,x1,x2), i.e. the
 * reverse of what vec_mstrgb24 passes.
 *
 * ptr is post-incremented three times, so it must be a modifiable vector
 * pointer lvalue without side effects of its own.
 *
 * The expansion deliberately has no trailing semicolon: the caller's own
 * ';' completes the statement, which keeps the macro usable as the body of
 * an unbraced if/else.
 */
#define vec_mstbgr24(x0,x1,x2,ptr)        \
do {                                      \
  typeof(x0) _0,_1,_2;                    \
  vec_merge3 (x0,x1,x2,_0,_1,_2);         \
  vec_st (_0, 0, ptr++);                  \
  vec_st (_1, 0, ptr++);                  \
  vec_st (_2, 0, ptr++);                  \
} while (0)
153 | |
17563
8084bcdb4898
Correct RGB vs. BGR confusion, the macros vec_mstrgb24 and vec_mstbgr24 each
diego
parents:
17560
diff
changeset
|
/*
 * Interleave three planar 16-sample vectors with vec_merge3 and store the
 * three resulting vectors through ptr.
 *
 * The arguments are handed to vec_merge3 in the order (x2,x1,x0), i.e. the
 * reverse of what vec_mstbgr24 passes.
 *
 * ptr is post-incremented three times, so it must be a modifiable vector
 * pointer lvalue without side effects of its own.
 *
 * The expansion deliberately has no trailing semicolon: the caller's own
 * ';' completes the statement, which keeps the macro usable as the body of
 * an unbraced if/else.
 */
#define vec_mstrgb24(x0,x1,x2,ptr)        \
do {                                      \
  typeof(x0) _0,_1,_2;                    \
  vec_merge3 (x2,x1,x0,_0,_1,_2);         \
  vec_st (_0, 0, ptr++);                  \
  vec_st (_1, 0, ptr++);                  \
  vec_st (_2, 0, ptr++);                  \
} while (0)
162 | |
163 /* pack the pixels in rgb0 format | |
164 msb R | |
165 lsb 0 | |
166 */ | |
/*
 * Interleave four planar byte vectors x0..x3 (16 samples each) into 16
 * four-byte pixels laid out x0,x1,x2,x3 and store them as four consecutive
 * vectors starting at ptr, then advance ptr by four elements.
 *
 * T   - vector type of x0..x3 (also used for the temporaries and stores)
 * ptr - modifiable pointer lvalue; it is evaluated several times
 *
 * The expansion deliberately has no trailing semicolon: the caller's own
 * ';' completes the statement, which keeps the macro usable as the body of
 * an unbraced if/else.
 */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                        \
do {                                                                           \
  T _0,_1,_2,_3;                                                               \
  /* first eight pixels: merge the high halves byte-wise, then pair-wise */    \
  _0 = vec_mergeh (x0,x1);                                                     \
  _1 = vec_mergeh (x2,x3);                                                     \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);    \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);    \
  vec_st (_2, 0*16, (T *)(ptr));                                               \
  vec_st (_3, 1*16, (T *)(ptr));                                               \
  /* last eight pixels: same again with the low halves */                      \
  _0 = vec_mergel (x0,x1);                                                     \
  _1 = vec_mergel (x2,x3);                                                     \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);    \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);    \
  vec_st (_2, 2*16, (T *)(ptr));                                               \
  vec_st (_3, 3*16, (T *)(ptr));                                               \
  (ptr) += 4;                                                                  \
} while (0)
184 | |
185 /* | |
186 | |
187 | 1 0 1.4021 | | Y | | |
188 | 1 -0.3441 -0.7142 |x| Cb| | |
189 | 1 1.7718 0 | | Cr| | |
190 | |
191 | |
192 Y: [-128 127] | |
193 Cb/Cr : [-128 127] | |
194 | |
195 typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode. | |
196 | |
197 */ | |
198 | |
199 | |
200 | |
201 | |
202 #define vec_unh(x) \ | |
203 (vector signed short) \ | |
13564 | 204 vec_perm(x,(typeof(x))AVV(0),\ |
205 (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\ | |
12698 | 206 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07)) |
207 #define vec_unl(x) \ | |
208 (vector signed short) \ | |
13564 | 209 vec_perm(x,(typeof(x))AVV(0),\ |
210 (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\ | |
12698 | 211 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F)) |
212 | |
17638
ee8d71a1b7ba
Fix vec_clip for gcc's that don't handle (vector signed short){16} according
pacman
parents:
17588
diff
changeset
|
213 #define vec_clip_s16(x) \ |
ee8d71a1b7ba
Fix vec_clip for gcc's that don't handle (vector signed short){16} according
pacman
parents:
17588
diff
changeset
|
214 vec_max (vec_min (x, (vector signed short)AVV(235,235,235,235,235,235,235,235)),\ |
ee8d71a1b7ba
Fix vec_clip for gcc's that don't handle (vector signed short){16} according
pacman
parents:
17588
diff
changeset
|
215 (vector signed short)AVV(16, 16, 16, 16, 16, 16, 16, 16 )) |
12698 | 216 |
217 #define vec_packclp(x,y) \ | |
218 (vector unsigned char)vec_packs \ | |
13564 | 219 ((vector unsigned short)vec_max (x,(vector signed short) AVV(0)), \ |
220 (vector unsigned short)vec_max (y,(vector signed short) AVV(0))) | |
12698 | 221 |
13564 | 222 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr) |
12698 | 223 |
224 | |
12836 | 225 static inline void cvtyuvtoRGB (SwsContext *c, |
12698 | 226 vector signed short Y, vector signed short U, vector signed short V, |
227 vector signed short *R, vector signed short *G, vector signed short *B) | |
228 { | |
229 vector signed short vx,ux,uvx; | |
230 | |
231 Y = vec_mradds (Y, c->CY, c->OY); | |
13564 | 232 U = vec_sub (U,(vector signed short) |
233 vec_splat((vector signed short)AVV(128),0)); | |
234 V = vec_sub (V,(vector signed short) | |
235 vec_splat((vector signed short)AVV(128),0)); | |
12698 | 236 |
237 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15; | |
238 ux = vec_sl (U, c->CSHIFT); | |
239 *B = vec_mradds (ux, c->CBU, Y); | |
240 | |
241 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15; | |
242 vx = vec_sl (V, c->CSHIFT); | |
243 *R = vec_mradds (vx, c->CRV, Y); | |
244 | |
245 // uvx = ((CGU*u) + (CGV*v))>>15; | |
246 uvx = vec_mradds (U, c->CGU, Y); | |
247 *G = vec_mradds (V, c->CGV, uvx); | |
248 } | |
249 | |
250 | |
251 /* | |
252 ------------------------------------------------------------------------------ | |
253 CS converters | |
254 ------------------------------------------------------------------------------ | |
255 */ | |
256 | |
257 | |
/*
 * DEFCSP420_CVT(name, out_pixels) defines
 *
 *   static int altivec_<name>(SwsContext *c,
 *                             unsigned char **in, int *instrides,
 *                             int srcSliceY, int srcSliceH,
 *                             unsigned char **oplanes, int *outstrides)
 *
 * which converts a slice with separate Y (in[0]), U (in[1]) and V (in[2])
 * planes into packed pixels written to oplanes[0], starting at output row
 * srcSliceY.  out_pixels(R,G,B,ptr) is the store macro that fixes the
 * packed layout and advances ptr.
 *
 * Two luma rows are converted per outer iteration and share one chroma row;
 * each inner iteration handles 16 pixels (16 luma bytes per row, 8 U and
 * 8 V bytes).  c->srcW / 16 groups and srcSliceH / 2 row pairs are
 * processed; any remainder is ignored.  Returns srcSliceH.
 *
 * Row addressing uses the caller's strides: the second luma row is
 * in[0] + instrides[0], the luma pointers advance by two strides per row
 * pair, and the output row pointers are recomputed from outstrides[0] for
 * every row pair.  For tightly packed buffers this yields the same
 * addresses as assuming stride == width.
 *
 * NOTE(review): luma is loaded with vec_ldl and pixels are stored with
 * vec_st without any realignment, so this presumably requires 16-byte
 * aligned luma rows and output rows -- confirm against the callers.
 * Chroma is realigned with vec_lvsl/vec_perm, which reads uivP[1]/vivP[1],
 * i.e. the vector following the one containing the current chroma pointer.
 */
#define DEFCSP420_CVT(name,out_pixels)                                     \
static int altivec_##name (SwsContext *c,                                  \
                           unsigned char **in, int *instrides,             \
                           int srcSliceY, int srcSliceH,                   \
                           unsigned char **oplanes, int *outstrides)       \
{                                                                          \
  int w = c->srcW;                                                         \
  int h = srcSliceH;                                                       \
  int i,j;                                                                 \
  int instrides_scl[3];                                                    \
  vector unsigned char y0,y1;                                              \
                                                                           \
  vector signed char  u,v;                                                 \
                                                                           \
  vector signed short Y0,Y1,Y2,Y3;                                         \
  vector signed short U,V;                                                 \
  vector signed short vx,ux,uvx;                                           \
  vector signed short vx0,ux0,uvx0;                                        \
  vector signed short vx1,ux1,uvx1;                                        \
  vector signed short R0,G0,B0;                                            \
  vector signed short R1,G1,B1;                                            \
  vector unsigned char R,G,B;                                              \
                                                                           \
  vector unsigned char *uivP, *vivP;                                       \
  vector unsigned char align_perm;                                         \
                                                                           \
  vector signed short                                                      \
    lCY  = c->CY,                                                          \
    lOY  = c->OY,                                                          \
    lCRV = c->CRV,                                                         \
    lCBU = c->CBU,                                                         \
    lCGU = c->CGU,                                                         \
    lCGV = c->CGV;                                                         \
                                                                           \
  vector unsigned short lCSHIFT = c->CSHIFT;                               \
                                                                           \
  ubyte *y1i   = in[0];                                                    \
  ubyte *y2i   = in[0]+instrides[0]; /* second row: one stride down */     \
  ubyte *ui    = in[1];                                                    \
  ubyte *vi    = in[2];                                                    \
                                                                           \
  vector unsigned char *oute, *outo;                                       \
                                                                           \
  /* the loop moves y1i/y2i by w and must end up two rows further down */  \
  instrides_scl[0] = instrides[0]*2-w;                                     \
  instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */      \
  instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */      \
                                                                           \
  for (i=0;i<h/2;i++) {                                                    \
    /* even and odd output row of this row pair */                         \
    oute = (vector unsigned char *)                                        \
           (oplanes[0]+(srcSliceY+2*i)*outstrides[0]);                     \
    outo = (vector unsigned char *)                                        \
           (oplanes[0]+(srcSliceY+2*i+1)*outstrides[0]);                   \
                                                                           \
    vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 \
    vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 \
                                                                           \
    for (j=0;j<w/16;j++) {                                                 \
                                                                           \
      y0 = vec_ldl (0,y1i);                                                \
      y1 = vec_ldl (0,y2i);                                                \
      uivP = (vector unsigned char *)ui;                                   \
      vivP = (vector unsigned char *)vi;                                   \
                                                                           \
      align_perm = vec_lvsl (0, ui);                                       \
      u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);     \
                                                                           \
      align_perm = vec_lvsl (0, vi);                                       \
      v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);     \
                                                                           \
      u = (vector signed char)                                             \
          vec_sub (u,(vector signed char)                                  \
                   vec_splat((vector signed char)AVV(128),0));             \
      v = (vector signed char)                                             \
          vec_sub (v,(vector signed char)                                  \
                   vec_splat((vector signed char)AVV(128),0));             \
                                                                           \
      /* only the first eight chroma samples are used per iteration */     \
      U  = vec_unpackh (u);                                                \
      V  = vec_unpackh (v);                                                \
                                                                           \
      Y0 = vec_unh (y0);                                                   \
      Y1 = vec_unl (y0);                                                   \
      Y2 = vec_unh (y1);                                                   \
      Y3 = vec_unl (y1);                                                   \
                                                                           \
      Y0 = vec_mradds (Y0, lCY, lOY);                                      \
      Y1 = vec_mradds (Y1, lCY, lOY);                                      \
      Y2 = vec_mradds (Y2, lCY, lOY);                                      \
      Y3 = vec_mradds (Y3, lCY, lOY);                                      \
                                                                           \
      /* ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                             \
      ux = vec_sl (U, lCSHIFT);                                            \
      ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));             \
      ux0  = vec_mergeh (ux,ux);                                           \
      ux1  = vec_mergel (ux,ux);                                           \
                                                                           \
      /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15 */                             \
      vx = vec_sl (V, lCSHIFT);                                            \
      vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));             \
      vx0  = vec_mergeh (vx,vx);                                           \
      vx1  = vec_mergel (vx,vx);                                           \
                                                                           \
      /* uvx = ((CGU*u) + (CGV*v))>>15 */                                  \
      uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));             \
      uvx = vec_mradds (V, lCGV, uvx);                                     \
      uvx0 = vec_mergeh (uvx,uvx);                                         \
      uvx1 = vec_mergel (uvx,uvx);                                         \
                                                                           \
      /* even row */                                                       \
      R0 = vec_add (Y0,vx0);                                               \
      G0 = vec_add (Y0,uvx0);                                              \
      B0 = vec_add (Y0,ux0);                                               \
      R1 = vec_add (Y1,vx1);                                               \
      G1 = vec_add (Y1,uvx1);                                              \
      B1 = vec_add (Y1,ux1);                                               \
                                                                           \
      R  = vec_packclp (R0,R1);                                            \
      G  = vec_packclp (G0,G1);                                            \
      B  = vec_packclp (B0,B1);                                            \
                                                                           \
      out_pixels(R,G,B,oute);                                              \
                                                                           \
      /* odd row, same chroma */                                           \
      R0 = vec_add (Y2,vx0);                                               \
      G0 = vec_add (Y2,uvx0);                                              \
      B0 = vec_add (Y2,ux0);                                               \
      R1 = vec_add (Y3,vx1);                                               \
      G1 = vec_add (Y3,uvx1);                                              \
      B1 = vec_add (Y3,ux1);                                               \
      R  = vec_packclp (R0,R1);                                            \
      G  = vec_packclp (G0,G1);                                            \
      B  = vec_packclp (B0,B1);                                            \
                                                                           \
      out_pixels(R,G,B,outo);                                              \
                                                                           \
      y1i  += 16;                                                          \
      y2i  += 16;                                                          \
      ui   += 8;                                                           \
      vi   += 8;                                                           \
                                                                           \
    }                                                                      \
                                                                           \
    ui    += instrides_scl[1];                                             \
    vi    += instrides_scl[2];                                             \
    y1i   += instrides_scl[0];                                             \
    y2i   += instrides_scl[0];                                             \
  }                                                                        \
  return srcSliceH;                                                        \
}
411 | |
412 | |
13564 | 413 #define out_abgr(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),c,b,a,ptr) |
414 #define out_bgra(a,b,c,ptr) vec_mstrgb32(typeof(a),c,b,a,((typeof (a))AVV(0)),ptr) | |
415 #define out_rgba(a,b,c,ptr) vec_mstrgb32(typeof(a),a,b,c,((typeof (a))AVV(0)),ptr) | |
416 #define out_argb(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,b,c,ptr) | |
12698 | 417 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr) |
17563
8084bcdb4898
Correct RGB vs. BGR confusion, the macros vec_mstrgb24 and vec_mstbgr24 each
diego
parents:
17560
diff
changeset
|
418 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr) |
12698 | 419 |
17844
194a848d907f
Support all 4 of ARGB/RGBA/BGRA/ABGR unscaled conversions, with more
pacman
parents:
17642
diff
changeset
|
420 DEFCSP420_CVT (yuv2_abgr, out_abgr) |
13564 | 421 #if 1 |
17844
194a848d907f
Support all 4 of ARGB/RGBA/BGRA/ABGR unscaled conversions, with more
pacman
parents:
17642
diff
changeset
|
422 DEFCSP420_CVT (yuv2_bgra, out_bgra) |
13564 | 423 #else |
424 static int altivec_yuv2_bgra32 (SwsContext *c, | |
425 unsigned char **in, int *instrides, | |
426 int srcSliceY, int srcSliceH, | |
427 unsigned char **oplanes, int *outstrides) | |
428 { | |
429 int w = c->srcW; | |
430 int h = srcSliceH; | |
431 int i,j; | |
432 int instrides_scl[3]; | |
433 vector unsigned char y0,y1; | |
434 | |
435 vector signed char u,v; | |
436 | |
437 vector signed short Y0,Y1,Y2,Y3; | |
438 vector signed short U,V; | |
439 vector signed short vx,ux,uvx; | |
440 vector signed short vx0,ux0,uvx0; | |
441 vector signed short vx1,ux1,uvx1; | |
442 vector signed short R0,G0,B0; | |
443 vector signed short R1,G1,B1; | |
444 vector unsigned char R,G,B; | |
445 | |
446 vector unsigned char *uivP, *vivP; | |
447 vector unsigned char align_perm; | |
448 | |
449 vector signed short | |
450 lCY = c->CY, | |
451 lOY = c->OY, | |
452 lCRV = c->CRV, | |
453 lCBU = c->CBU, | |
454 lCGU = c->CGU, | |
455 lCGV = c->CGV; | |
456 | |
457 vector unsigned short lCSHIFT = c->CSHIFT; | |
458 | |
459 ubyte *y1i = in[0]; | |
460 ubyte *y2i = in[0]+w; | |
461 ubyte *ui = in[1]; | |
462 ubyte *vi = in[2]; | |
463 | |
464 vector unsigned char *oute | |
465 = (vector unsigned char *) | |
466 (oplanes[0]+srcSliceY*outstrides[0]); | |
467 vector unsigned char *outo | |
468 = (vector unsigned char *) | |
469 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); | |
470 | |
471 | |
472 instrides_scl[0] = instrides[0]; | |
473 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ | |
474 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ | |
475 | |
476 | |
477 for (i=0;i<h/2;i++) { | |
478 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); | |
479 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); | |
480 | |
481 for (j=0;j<w/16;j++) { | |
482 | |
483 y0 = vec_ldl (0,y1i); | |
484 y1 = vec_ldl (0,y2i); | |
485 uivP = (vector unsigned char *)ui; | |
486 vivP = (vector unsigned char *)vi; | |
487 | |
488 align_perm = vec_lvsl (0, ui); | |
489 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); | |
490 | |
491 align_perm = vec_lvsl (0, vi); | |
492 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); | |
493 u = (vector signed char) | |
494 vec_sub (u,(vector signed char) | |
495 vec_splat((vector signed char)AVV(128),0)); | |
496 | |
497 v = (vector signed char) | |
498 vec_sub (v, (vector signed char) | |
499 vec_splat((vector signed char)AVV(128),0)); | |
500 | |
501 U = vec_unpackh (u); | |
502 V = vec_unpackh (v); | |
503 | |
504 | |
505 Y0 = vec_unh (y0); | |
506 Y1 = vec_unl (y0); | |
507 Y2 = vec_unh (y1); | |
508 Y3 = vec_unl (y1); | |
509 | |
510 Y0 = vec_mradds (Y0, lCY, lOY); | |
511 Y1 = vec_mradds (Y1, lCY, lOY); | |
512 Y2 = vec_mradds (Y2, lCY, lOY); | |
513 Y3 = vec_mradds (Y3, lCY, lOY); | |
514 | |
515 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ | |
516 ux = vec_sl (U, lCSHIFT); | |
517 ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); | |
518 ux0 = vec_mergeh (ux,ux); | |
519 ux1 = vec_mergel (ux,ux); | |
520 | |
521 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ | |
522 vx = vec_sl (V, lCSHIFT); | |
523 vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); | |
524 vx0 = vec_mergeh (vx,vx); | |
525 vx1 = vec_mergel (vx,vx); | |
526 /* uvx = ((CGU*u) + (CGV*v))>>15 */ | |
527 uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); | |
528 uvx = vec_mradds (V, lCGV, uvx); | |
529 uvx0 = vec_mergeh (uvx,uvx); | |
530 uvx1 = vec_mergel (uvx,uvx); | |
531 R0 = vec_add (Y0,vx0); | |
532 G0 = vec_add (Y0,uvx0); | |
533 B0 = vec_add (Y0,ux0); | |
534 R1 = vec_add (Y1,vx1); | |
535 G1 = vec_add (Y1,uvx1); | |
536 B1 = vec_add (Y1,ux1); | |
537 R = vec_packclp (R0,R1); | |
538 G = vec_packclp (G0,G1); | |
539 B = vec_packclp (B0,B1); | |
540 | |
541 out_argb(R,G,B,oute); | |
542 R0 = vec_add (Y2,vx0); | |
543 G0 = vec_add (Y2,uvx0); | |
544 B0 = vec_add (Y2,ux0); | |
545 R1 = vec_add (Y3,vx1); | |
546 G1 = vec_add (Y3,uvx1); | |
547 B1 = vec_add (Y3,ux1); | |
548 R = vec_packclp (R0,R1); | |
549 G = vec_packclp (G0,G1); | |
550 B = vec_packclp (B0,B1); | |
551 | |
552 out_argb(R,G,B,outo); | |
553 y1i += 16; | |
554 y2i += 16; | |
555 ui += 8; | |
556 vi += 8; | |
557 | |
558 } | |
559 | |
560 outo += (outstrides[0])>>4; | |
561 oute += (outstrides[0])>>4; | |
562 | |
563 ui += instrides_scl[1]; | |
564 vi += instrides_scl[2]; | |
565 y1i += instrides_scl[0]; | |
566 y2i += instrides_scl[0]; | |
567 } | |
568 return srcSliceH; | |
569 } | |
570 | |
571 #endif | |
572 | |
573 | |
17844
194a848d907f
Support all 4 of ARGB/RGBA/BGRA/ABGR unscaled conversions, with more
pacman
parents:
17642
diff
changeset
|
574 DEFCSP420_CVT (yuv2_rgba, out_rgba) |
194a848d907f
Support all 4 of ARGB/RGBA/BGRA/ABGR unscaled conversions, with more
pacman
parents:
17642
diff
changeset
|
575 DEFCSP420_CVT (yuv2_argb, out_argb) |
12698 | 576 DEFCSP420_CVT (yuv2_rgb24, out_rgb24) |
577 DEFCSP420_CVT (yuv2_bgr24, out_bgr24) | |
578 | |
579 | |
580 // uyvy|uyvy|uyvy|uyvy | |
581 // 0123 4567 89ab cdef | |
582 static | |
583 const vector unsigned char | |
13564 | 584 demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00, |
12698 | 585 0x10,0x04,0x10,0x04, |
586 0x10,0x08,0x10,0x08, | |
587 0x10,0x0c,0x10,0x0c), | |
13564 | 588 demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02, |
12698 | 589 0x10,0x06,0x10,0x06, |
590 0x10,0x0A,0x10,0x0A, | |
591 0x10,0x0E,0x10,0x0E), | |
13564 | 592 demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03, |
12698 | 593 0x10,0x05,0x10,0x07, |
594 0x10,0x09,0x10,0x0B, | |
595 0x10,0x0D,0x10,0x0F); | |
596 | |
597 /* | |
598 this is so I can play live CCIR raw video | |
599 */ | |
600 static int altivec_uyvy_rgb32 (SwsContext *c, | |
601 unsigned char **in, int *instrides, | |
602 int srcSliceY, int srcSliceH, | |
603 unsigned char **oplanes, int *outstrides) | |
604 { | |
605 int w = c->srcW; | |
606 int h = srcSliceH; | |
607 int i,j; | |
608 vector unsigned char uyvy; | |
609 vector signed short Y,U,V; | |
610 vector signed short vx,ux,uvx; | |
611 vector signed short R0,G0,B0,R1,G1,B1; | |
612 vector unsigned char R,G,B; | |
613 vector unsigned char *out; | |
614 ubyte *img; | |
615 | |
616 img = in[0]; | |
617 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]); | |
618 | |
619 for (i=0;i<h;i++) { | |
620 for (j=0;j<w/16;j++) { | |
621 uyvy = vec_ld (0, img); | |
622 U = (vector signed short) | |
13564 | 623 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u); |
12698 | 624 |
625 V = (vector signed short) | |
13564 | 626 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v); |
12698 | 627 |
628 Y = (vector signed short) | |
13564 | 629 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y); |
12698 | 630 |
631 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0); | |
632 | |
633 uyvy = vec_ld (16, img); | |
634 U = (vector signed short) | |
13564 | 635 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u); |
12698 | 636 |
637 V = (vector signed short) | |
13564 | 638 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v); |
12698 | 639 |
640 Y = (vector signed short) | |
13564 | 641 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y); |
12698 | 642 |
643 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1); | |
644 | |
645 R = vec_packclp (R0,R1); | |
646 G = vec_packclp (G0,G1); | |
647 B = vec_packclp (B0,B1); | |
648 | |
649 // vec_mstbgr24 (R,G,B, out); | |
650 out_rgba (R,G,B,out); | |
651 | |
652 img += 32; | |
653 } | |
654 } | |
12836 | 655 return srcSliceH; |
12698 | 656 } |
657 | |
658 | |
659 | |
660 /* Ok currently the acceleration routine only supports | |
661 inputs of widths a multiple of 16 | |
662 and heights a multiple 2 | |
663 | |
664 So we just fall back to the C codes for this. | |
665 */ | |
666 SwsFunc yuv2rgb_init_altivec (SwsContext *c) | |
667 { | |
668 if (!(c->flags & SWS_CPU_CAPS_ALTIVEC)) | |
669 return NULL; | |
670 | |
671 /* | |
672 and this seems not to matter too much I tried a bunch of | |
673 videos with abnormal widths and mplayer crashes else where. | |
674 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv | |
675 boom with X11 bad match. | |
676 | |
677 */ | |
678 if ((c->srcW & 0xf) != 0) return NULL; | |
679 | |
680 switch (c->srcFormat) { | |
681 case IMGFMT_YVU9: | |
682 case IMGFMT_IF09: | |
683 case IMGFMT_YV12: | |
684 case IMGFMT_I420: | |
685 case IMGFMT_IYUV: | |
686 case IMGFMT_CLPL: | |
687 case IMGFMT_Y800: | |
688 case IMGFMT_Y8: | |
689 case IMGFMT_NV12: | |
690 case IMGFMT_NV21: | |
691 if ((c->srcH & 0x1) != 0) | |
692 return NULL; | |
693 | |
694 switch(c->dstFormat){ | |
695 case IMGFMT_RGB24: | |
696 MSG_WARN("ALTIVEC: Color Space RGB24\n"); | |
697 return altivec_yuv2_rgb24; | |
698 case IMGFMT_BGR24: | |
699 MSG_WARN("ALTIVEC: Color Space BGR24\n"); | |
700 return altivec_yuv2_bgr24; | |
17844
194a848d907f
Support all 4 of ARGB/RGBA/BGRA/ABGR unscaled conversions, with more
pacman
parents:
17642
diff
changeset
|
701 case IMGFMT_ARGB: |
194a848d907f
Support all 4 of ARGB/RGBA/BGRA/ABGR unscaled conversions, with more
pacman
parents:
17642
diff
changeset
|
702 MSG_WARN("ALTIVEC: Color Space ARGB\n"); |
194a848d907f
Support all 4 of ARGB/RGBA/BGRA/ABGR unscaled conversions, with more
pacman
parents:
17642
diff
changeset
|
703 return altivec_yuv2_argb; |
194a848d907f
Support all 4 of ARGB/RGBA/BGRA/ABGR unscaled conversions, with more
pacman
parents:
17642
diff
changeset
|
704 case IMGFMT_ABGR: |
194a848d907f
Support all 4 of ARGB/RGBA/BGRA/ABGR unscaled conversions, with more
pacman
parents:
17642
diff
changeset
|
705 MSG_WARN("ALTIVEC: Color Space ABGR\n"); |
194a848d907f
Support all 4 of ARGB/RGBA/BGRA/ABGR unscaled conversions, with more
pacman
parents:
17642
diff
changeset
|
706 return altivec_yuv2_abgr; |
194a848d907f
Support all 4 of ARGB/RGBA/BGRA/ABGR unscaled conversions, with more
pacman
parents:
17642
diff
changeset
|
707 case IMGFMT_RGBA: |
194a848d907f
Support all 4 of ARGB/RGBA/BGRA/ABGR unscaled conversions, with more
pacman
parents:
17642
diff
changeset
|
708 MSG_WARN("ALTIVEC: Color Space RGBA\n"); |
194a848d907f
Support all 4 of ARGB/RGBA/BGRA/ABGR unscaled conversions, with more
pacman
parents:
17642
diff
changeset
|
709 return altivec_yuv2_rgba; |
194a848d907f
Support all 4 of ARGB/RGBA/BGRA/ABGR unscaled conversions, with more
pacman
parents:
17642
diff
changeset
|
710 case IMGFMT_BGRA: |
194a848d907f
Support all 4 of ARGB/RGBA/BGRA/ABGR unscaled conversions, with more
pacman
parents:
17642
diff
changeset
|
711 MSG_WARN("ALTIVEC: Color Space BGRA\n"); |
194a848d907f
Support all 4 of ARGB/RGBA/BGRA/ABGR unscaled conversions, with more
pacman
parents:
17642
diff
changeset
|
712 return altivec_yuv2_bgra; |
12698 | 713 default: return NULL; |
714 } | |
715 break; | |
716 | |
717 case IMGFMT_UYVY: | |
718 switch(c->dstFormat){ | |
719 case IMGFMT_RGB32: | |
720 MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n"); | |
721 return altivec_uyvy_rgb32; | |
722 default: return NULL; | |
723 } | |
724 break; | |
725 | |
726 } | |
727 return NULL; | |
728 } | |
729 | |
/*
 * Round a value with 16 fractional bits to the nearest integer and
 * saturate it to 16 bits.
 *
 * f - fixed-point input; the integer result is (f + 0x8000) >> 16
 *
 * Returns the rounded value as a 16-bit pattern: results above 0x7FFF
 * yield 0x7FFF, results below -0x7FFF yield 0x8000, everything else is
 * returned unchanged (negative values in two's complement).
 *
 * The rounded value is kept in 64 bits until after the range check; doing
 * the check on an int would let large magnitudes wrap around instead of
 * saturating.
 */
static uint16_t roundToInt16(int64_t f){
    int64_t r;

    /* adding the rounding constant must not overflow int64_t */
    if (f > INT64_MAX - (1<<15))
        return 0x7FFF;

    r = (f + (1<<15))>>16;
    if (r < -0x7FFF)
        return 0x8000;
    if (r > 0x7FFF)
        return 0x7FFF;
    return (uint16_t)r;
}
12698 | 736 |
13564 | 737 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation) |
738 { | |
739 union { | |
740 signed short tmp[8] __attribute__ ((aligned(16))); | |
741 vector signed short vec; | |
742 } buf; | |
12698 | 743 |
13564 | 744 buf.tmp[0] = ( (0xffffLL) * contrast>>8 )>>9; //cy |
745 buf.tmp[1] = -256*brightness; //oy | |
746 buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv | |
747 buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu | |
748 buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu | |
749 buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv | |
12698 | 750 |
751 | |
17638
ee8d71a1b7ba
Fix vec_clip for gcc's that don't handle (vector signed short){16} according
pacman
parents:
17588
diff
changeset
|
752 c->CSHIFT = (vector unsigned short)vec_splat_u16(2); |
13564 | 753 c->CY = vec_splat ((vector signed short)buf.vec, 0); |
754 c->OY = vec_splat ((vector signed short)buf.vec, 1); | |
755 c->CRV = vec_splat ((vector signed short)buf.vec, 2); | |
756 c->CBU = vec_splat ((vector signed short)buf.vec, 3); | |
757 c->CGU = vec_splat ((vector signed short)buf.vec, 4); | |
758 c->CGV = vec_splat ((vector signed short)buf.vec, 5); | |
12836 | 759 #if 0 |
13564 | 760 { |
761 int i; | |
762 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"}; | |
763 for (i=0; i<6;i++) | |
764 printf("%s %d ", v[i],buf.tmp[i] ); | |
765 printf("\n"); | |
766 } | |
12698 | 767 #endif |
12836 | 768 return; |
12698 | 769 } |
770 | |
771 | |
772 void | |
773 altivec_yuv2packedX (SwsContext *c, | |
774 int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | |
775 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
776 uint8_t *dest, int dstW, int dstY) | |
777 { | |
778 int i,j; | |
779 short *f; | |
780 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V; | |
781 vector signed short R0,G0,B0,R1,G1,B1; | |
782 | |
783 vector unsigned char R,G,B,pels[3]; | |
784 vector unsigned char *out,*nout; | |
13564 | 785 |
17638
ee8d71a1b7ba
Fix vec_clip for gcc's that don't handle (vector signed short){16} according
pacman
parents:
17588
diff
changeset
|
786 vector signed short RND = vec_splat_s16(1<<3); |
ee8d71a1b7ba
Fix vec_clip for gcc's that don't handle (vector signed short){16} according
pacman
parents:
17588
diff
changeset
|
787 vector unsigned short SCL = vec_splat_u16(4); |
12698 | 788 unsigned long scratch[16] __attribute__ ((aligned (16))); |
789 | |
790 vector signed short *YCoeffs, *CCoeffs; | |
791 | |
17588
79081ba52e00
Move the v{Y,C}CoeffsBank vectors into the SwsContext, filling them in just
diego
parents:
17563
diff
changeset
|
792 YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize; |
79081ba52e00
Move the v{Y,C}CoeffsBank vectors into the SwsContext, filling them in just
diego
parents:
17563
diff
changeset
|
793 CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize; |
12698 | 794 |
795 out = (vector unsigned char *)dest; | |
796 | |
797 for(i=0; i<dstW; i+=16){ | |
798 Y0 = RND; | |
799 Y1 = RND; | |
800 /* extract 16 coeffs from lumSrc */ | |
801 for(j=0; j<lumFilterSize; j++) { | |
802 X0 = vec_ld (0, &lumSrc[j][i]); | |
803 X1 = vec_ld (16, &lumSrc[j][i]); | |
804 Y0 = vec_mradds (X0, YCoeffs[j], Y0); | |
805 Y1 = vec_mradds (X1, YCoeffs[j], Y1); | |
806 } | |
807 | |
808 U = RND; | |
809 V = RND; | |
810 /* extract 8 coeffs from U,V */ | |
811 for(j=0; j<chrFilterSize; j++) { | |
812 X = vec_ld (0, &chrSrc[j][i/2]); | |
813 U = vec_mradds (X, CCoeffs[j], U); | |
814 X = vec_ld (0, &chrSrc[j][i/2+2048]); | |
815 V = vec_mradds (X, CCoeffs[j], V); | |
816 } | |
817 | |
818 /* scale and clip signals */ | |
819 Y0 = vec_sra (Y0, SCL); | |
820 Y1 = vec_sra (Y1, SCL); | |
821 U = vec_sra (U, SCL); | |
822 V = vec_sra (V, SCL); | |
823 | |
17638
ee8d71a1b7ba
Fix vec_clip for gcc's that don't handle (vector signed short){16} according
pacman
parents:
17588
diff
changeset
|
824 Y0 = vec_clip_s16 (Y0); |
ee8d71a1b7ba
Fix vec_clip for gcc's that don't handle (vector signed short){16} according
pacman
parents:
17588
diff
changeset
|
825 Y1 = vec_clip_s16 (Y1); |
ee8d71a1b7ba
Fix vec_clip for gcc's that don't handle (vector signed short){16} according
pacman
parents:
17588
diff
changeset
|
826 U = vec_clip_s16 (U); |
ee8d71a1b7ba
Fix vec_clip for gcc's that don't handle (vector signed short){16} according
pacman
parents:
17588
diff
changeset
|
827 V = vec_clip_s16 (V); |
12698 | 828 |
829 /* now we have | |
830 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 | |
831 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7 | |
832 | |
833 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 | |
834 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7 | |
835 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7 | |
836 */ | |
837 | |
838 U0 = vec_mergeh (U,U); | |
839 V0 = vec_mergeh (V,V); | |
840 | |
841 U1 = vec_mergel (U,U); | |
842 V1 = vec_mergel (V,V); | |
843 | |
844 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); | |
845 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); | |
846 | |
847 R = vec_packclp (R0,R1); | |
848 G = vec_packclp (G0,G1); | |
849 B = vec_packclp (B0,B1); | |
850 | |
17560
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
851 switch(c->dstFormat) { |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
852 case IMGFMT_ABGR: out_abgr (R,G,B,out); break; |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
853 case IMGFMT_BGRA: out_bgra (R,G,B,out); break; |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
854 case IMGFMT_RGBA: out_rgba (R,G,B,out); break; |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
855 case IMGFMT_ARGB: out_argb (R,G,B,out); break; |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
856 case IMGFMT_RGB24: out_rgb24 (R,G,B,out); break; |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
857 case IMGFMT_BGR24: out_bgr24 (R,G,B,out); break; |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
858 default: |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
859 { |
17642
64e5c4e34f07
Just a comment update, replacing a FIXME to reflect the new expectation of
pacman
parents:
17638
diff
changeset
|
860 /* If this is reached, the caller should have called yuv2packedXinC |
64e5c4e34f07
Just a comment update, replacing a FIXME to reflect the new expectation of
pacman
parents:
17638
diff
changeset
|
861 instead. */ |
17560
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
862 static int printed_error_message; |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
863 if(!printed_error_message) { |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
864 MSG_ERR("altivec_yuv2packedX doesn't support %s output\n", |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
865 vo_format_name(c->dstFormat)); |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
866 printed_error_message=1; |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
867 } |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
868 return; |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
869 } |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
870 } |
12698 | 871 } |
872 | |
873 if (i < dstW) { | |
874 i -= 16; | |
875 | |
876 Y0 = RND; | |
877 Y1 = RND; | |
878 /* extract 16 coeffs from lumSrc */ | |
879 for(j=0; j<lumFilterSize; j++) { | |
880 X0 = vec_ld (0, &lumSrc[j][i]); | |
881 X1 = vec_ld (16, &lumSrc[j][i]); | |
882 Y0 = vec_mradds (X0, YCoeffs[j], Y0); | |
883 Y1 = vec_mradds (X1, YCoeffs[j], Y1); | |
884 } | |
885 | |
886 U = RND; | |
887 V = RND; | |
888 /* extract 8 coeffs from U,V */ | |
889 for(j=0; j<chrFilterSize; j++) { | |
890 X = vec_ld (0, &chrSrc[j][i/2]); | |
891 U = vec_mradds (X, CCoeffs[j], U); | |
892 X = vec_ld (0, &chrSrc[j][i/2+2048]); | |
893 V = vec_mradds (X, CCoeffs[j], V); | |
894 } | |
895 | |
896 /* scale and clip signals */ | |
897 Y0 = vec_sra (Y0, SCL); | |
898 Y1 = vec_sra (Y1, SCL); | |
899 U = vec_sra (U, SCL); | |
900 V = vec_sra (V, SCL); | |
901 | |
17638
ee8d71a1b7ba
Fix vec_clip for gcc's that don't handle (vector signed short){16} according
pacman
parents:
17588
diff
changeset
|
902 Y0 = vec_clip_s16 (Y0); |
ee8d71a1b7ba
Fix vec_clip for gcc's that don't handle (vector signed short){16} according
pacman
parents:
17588
diff
changeset
|
903 Y1 = vec_clip_s16 (Y1); |
ee8d71a1b7ba
Fix vec_clip for gcc's that don't handle (vector signed short){16} according
pacman
parents:
17588
diff
changeset
|
904 U = vec_clip_s16 (U); |
ee8d71a1b7ba
Fix vec_clip for gcc's that don't handle (vector signed short){16} according
pacman
parents:
17588
diff
changeset
|
905 V = vec_clip_s16 (V); |
12698 | 906 |
907 /* now we have | |
908 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 | |
909 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7 | |
910 | |
911 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 | |
912 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7 | |
913 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7 | |
914 */ | |
915 | |
916 U0 = vec_mergeh (U,U); | |
917 V0 = vec_mergeh (V,V); | |
918 | |
919 U1 = vec_mergel (U,U); | |
920 V1 = vec_mergel (V,V); | |
921 | |
922 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); | |
923 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); | |
924 | |
925 R = vec_packclp (R0,R1); | |
926 G = vec_packclp (G0,G1); | |
927 B = vec_packclp (B0,B1); | |
928 | |
929 nout = (vector unsigned char *)scratch; | |
17560
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
930 switch(c->dstFormat) { |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
931 case IMGFMT_ABGR: out_abgr (R,G,B,nout); break; |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
932 case IMGFMT_BGRA: out_bgra (R,G,B,nout); break; |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
933 case IMGFMT_RGBA: out_rgba (R,G,B,nout); break; |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
934 case IMGFMT_ARGB: out_argb (R,G,B,nout); break; |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
935 case IMGFMT_RGB24: out_rgb24 (R,G,B,nout); break; |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
936 case IMGFMT_BGR24: out_bgr24 (R,G,B,nout); break; |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
937 default: |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
938 /* Unreachable, I think. */ |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
939 MSG_ERR("altivec_yuv2packedX doesn't support %s output\n", |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
940 vo_format_name(c->dstFormat)); |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
941 return; |
1a54f39404b9
altivec_yuv2packedX() ignores the requested output format and unconditionally
diego
parents:
17558
diff
changeset
|
942 } |
12698 | 943 |
944 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4); | |
945 } | |
946 | |
947 } |