29042
|
1 /*
|
|
2 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
|
|
3 * April 20, 2007
|
|
4 *
|
|
5 * Blackfin video color space converter operations
|
|
6 * convert I420 YV12 to RGB in various formats
|
|
7 *
|
|
8 * This file is part of FFmpeg.
|
|
9 *
|
|
10 * FFmpeg is free software; you can redistribute it and/or
|
|
11 * modify it under the terms of the GNU Lesser General Public
|
|
12 * License as published by the Free Software Foundation; either
|
|
13 * version 2.1 of the License, or (at your option) any later version.
|
|
14 *
|
|
15 * FFmpeg is distributed in the hope that it will be useful,
|
|
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
18 * Lesser General Public License for more details.
|
|
19 *
|
|
20 * You should have received a copy of the GNU Lesser General Public
|
|
21 * License along with FFmpeg; if not, write to the Free Software
|
|
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
23 */
|
|
24
|
|
25
|
|
26 /*
|
|
27 YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
|
|
28 and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts.
|
|
29
|
|
30
|
|
31 The following calculation is used for the conversion:
|
|
32
|
|
33 r = clipz((y-oy)*cy + crv*(v-128))
|
|
34 g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
|
|
35 b = clipz((y-oy)*cy + cbu*(u-128))
|
|
36
|
|
37 y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
|
|
38
|
|
39
|
|
40 New factorization to eliminate the truncation error which was
|
|
41 occurring due to the byteop3p.
|
|
42
|
|
43
|
|
44 1) Use byteop16m to subtract quad bytes. It operates on U8 data,
|
|
45 so the offsets need to be renormalized to 8 bits.
|
|
46
|
|
47 2) Scale operands up by a factor of 4 not 8 because Blackfin
|
|
48 multiplies include a shift.
|
|
49
|
|
50 3) Compute into the accumulators cy*yx0, cy*yx1.
|
|
51
|
|
52 4) Compute each of the linear equations:
|
|
53 r = clipz((y - oy) * cy + crv * (v - 128))
|
|
54
|
|
55 g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
|
|
56
|
|
57 b = clipz((y - oy) * cy + cbu * (u - 128))
|
|
58
|
|
59 Reuse of the accumulators requires that we actually multiply
|
|
60 twice once with addition and the second time with a subtraction.
|
|
61
|
|
62 Because of this we need to compute the equations in the order R B
|
|
63 then G saving the writes for B in the case of 24/32 bit color
|
|
64 formats.
|
|
65
|
|
66 API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
|
|
67 int dW, uint32_t *coeffs);
|
|
68
|
|
69 A B
|
|
70 --- ---
|
|
71 i2 = cb i3 = cr
|
|
72 i1 = coeff i0 = y
|
|
73
|
|
74 Where coeffs have the following layout in memory.
|
|
75
|
|
76 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
|
|
77
|
|
78 coeffs is a pointer to oy.
|
|
79
|
|
80 The {rgb} masks are only utilized by the 565 packing algorithm. Note the data
|
|
81 replication is used to simplify the internal algorithms for the dual Mac
|
|
82 architecture of Blackfin.
|
|
83
|
|
84 All routines are exported with _ff_bfin_ as a symbol prefix.
|
|
85
|
|
86 Rough performance gain compared against -O3:
|
|
87
|
|
88 2779809/1484290 187.28%
|
|
89
|
|
90 which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
|
|
91 c/pel for the optimized implementations. Not sure why there is such a
|
|
92 huge variation on the reference codes on Blackfin I guess it must have
|
|
93 to do with the memory system.
|
|
94 */
|
|
95
|
|
/*
 * Section selection.
 *   mL3 - the ordinary .text section.
 *   mL1 - .l1.text when building FDPIC with CONFIG_SRAM, otherwise mL3.
 *   MEM - section used by the yuv2rgb*_line routines below.
 */
#define mL3 .text
#if defined(__FDPIC__) && CONFIG_SRAM
#define mL1 .l1.text
#else
#define mL1 mL3
#endif
#define MEM mL1

/*
 * DEFUN(fname, where, interface)
 *   Switches to section `where`, exports _ff_bfin_<fname> as a function
 *   symbol and emits the symbol name; the ':' that turns it into a label is
 *   written at the invocation site.  `interface` (the C prototype) is not
 *   expanded -- it is documentation only.
 */
#define DEFUN(fname,where,interface) \
        .section where;              \
        .global _ff_bfin_ ## fname;  \
        .type _ff_bfin_ ## fname, STT_FUNC; \
        .align 8;                    \
        _ff_bfin_ ## fname

/* DEFUN_END(fname): records the size of _ff_bfin_<fname> for the symbol table. */
#define DEFUN_END(fname) \
        .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname


.text

/*
 * Coefficient table geometry (bytes).
 *   COEFF_LEN        - 11 words; loaded into l1 as the circular-buffer length.
 *   COEFF_REL_CY_OFF - post-modify applied (through m0) after the first gmask
 *                      load so that the following load fetches cy again.
 */
#define COEFF_LEN        11*4
#define COEFF_REL_CY_OFF 4*4

/* fp-relative offsets (after "link 0") of the stack-passed arguments out, dW, coeffs. */
#define ARG_OUT   20
#define ARG_W     24
#define ARG_COEFF 28
|
|
123
|
|
/*
 * _ff_bfin_yuv2rgb565_line
 *
 * Converts one line of planar YUV 4:2:0 samples to packed 16-bit pixels
 * (5/6/5 bit fields), two pixels per 32-bit store.
 *
 * In:     r0 = Y, r1 = U, r2 = V
 *         [fp+ARG_OUT] = out, [fp+ARG_W] = dW, [fp+ARG_COEFF] = coeffs
 * Loop:   dW >> 2 passes; each pass consumes 4 Y, 2 U and 2 V bytes and
 *         stores two 32-bit words (4 pixels) through p1.
 * Coeffs: i1 walks the table as a circular buffer (b1 = base,
 *         l1 = COEFF_LEN); l1 is reset to 0 before returning.
 * Saved:  r7:4 are pushed on entry and popped on exit.
 * Not restored: r0-r3, p0, p1, i0-i3, b1, m0, a0, a1, lc0.
 * Note:   the loads are software pipelined -- the first group is fetched
 *         before the loop and every pass fetches the group for the next
 *         one, so the final pass still reads 4 Y / 2 U / 2 V bytes beyond
 *         the data it converts.
 */
DEFUN(yuv2rgb565_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF];
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0 = [i0++];         // 4Y
        r1.l = w[i2++];      // 2u
        r1.h = w[i3++];      // 2v
        p0 = p0>>2;

        lsetup (.L0565, .L1565) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
.L0565:
        /*
           rrrrrrrr gggggggg bbbbbbbb
            5432109876543210
                       bbbbb >>3
                 gggggggg    <<3
            rrrrrrrr         <<8
            rrrrrggggggbbbbb

           Each channel is shifted into place and then ANDed with its mask
           from the coefficient table; in the code below the first computed
           channel is shifted >>3, the second <<8 and the third <<3.
        */
        (r4,r5) = byteop16m (r1:0, r3:2)   || r3=[i1++];   // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);                                  // y1,y0
        r4 = r4 << 2 (v);                                  // y3,y2
        r6 = r6 << 2 (v)                   || r0=[i1++];   // u1,u0, r0=zero
        r7 = r7 << 2 (v)                   || r1=[i1++];   // v1,v0  r1=cy
        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l     || r1=[i1++];   // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        /* take the chroma term back out so a1/a0 hold Y' again */
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++];   // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)      || r1=[i1++];   // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++];   // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)      || r1=[i1++];   // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++];   // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        r2 = byteop3p(r3:2, r1:0)(LO)      || r5=[i1++m0]; // gmask; i1 then points back at cy
        r2 = r2 << 3 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                          || r1=[i1++];   // cy

        /* Y' = y*cy */

        a1 = r1.h*r4.h, a0 = r1.l*r4.l     || r1=[i1++];   // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++];   // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)      || r1=[i1++];   // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++];   // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)      || r1=[i1++];   // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++];   // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++];   // gmask
        r2 = byteop3p(r3:2, r1:0)(LO)      || r0 = [i0++];     // 4Y (next group)
        r2 = r2 << 3 (v)                   || r1.l = w[i2++];  // 2u
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                          || r1.h = w[i3++];  // 2v
.L1565: r2=[i1++]; // oy

        l1 = 0;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb565_line)
|
|
236
|
|
/*
 * _ff_bfin_yuv2rgb555_line
 *
 * Converts one line of planar YUV 4:2:0 samples to packed 16-bit pixels
 * (5/5/5 bit fields), two pixels per 32-bit store.  Identical in structure
 * to the 565 routine; only the shift amounts of the second and third
 * channel differ (<<7 and <<2).
 *
 * In:     r0 = Y, r1 = U, r2 = V
 *         [fp+ARG_OUT] = out, [fp+ARG_W] = dW, [fp+ARG_COEFF] = coeffs
 * Loop:   dW >> 2 passes; each pass consumes 4 Y, 2 U and 2 V bytes and
 *         stores two 32-bit words (4 pixels) through p1.
 * Coeffs: i1 walks the table as a circular buffer (b1 = base,
 *         l1 = COEFF_LEN); l1 is reset to 0 before returning.
 * Saved:  r7:4 are pushed on entry and popped on exit.
 * Not restored: r0-r3, p0, p1, i0-i3, b1, m0, a0, a1, lc0.
 * Note:   the loads are software pipelined, so the final pass still reads
 *         4 Y / 2 U / 2 V bytes beyond the data it converts.
 */
DEFUN(yuv2rgb555_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF];
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0 = [i0++];         // 4Y
        r1.l = w[i2++];      // 2u
        r1.h = w[i3++];      // 2v
        p0 = p0>>2;

        lsetup (.L0555, .L1555) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
.L0555:
        /*
           rrrrrrrr gggggggg bbbbbbbb
            5432109876543210
                       bbbbb >>3
                  gggggggg   <<2
             rrrrrrrr        <<7
            xrrrrrgggggbbbbb

           Each channel is shifted into place and then ANDed with its mask
           from the coefficient table; in the code below the first computed
           channel is shifted >>3, the second <<7 and the third <<2.
        */

        (r4,r5) = byteop16m (r1:0, r3:2)   || r3=[i1++];   // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);                                  // y1,y0
        r4 = r4 << 2 (v);                                  // y3,y2
        r6 = r6 << 2 (v)                   || r0=[i1++];   // u1,u0, r0=zero
        r7 = r7 << 2 (v)                   || r1=[i1++];   // v1,v0  r1=cy
        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l     || r1=[i1++];   // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        /* take the chroma term back out so a1/a0 hold Y' again */
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++];   // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)      || r1=[i1++];   // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++];   // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)      || r1=[i1++];   // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++];   // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        r2 = byteop3p(r3:2, r1:0)(LO)      || r5=[i1++m0]; // gmask; i1 then points back at cy
        r2 = r2 << 2 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                          || r1=[i1++];   // cy

        /* Y' = y*cy */

        a1 = r1.h*r4.h, a0 = r1.l*r4.l     || r1=[i1++];   // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++];   // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)      || r1=[i1++];   // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++];   // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)      || r1=[i1++];   // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++];   // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++];   // gmask
        r2 = byteop3p(r3:2, r1:0)(LO)      || r0=[i0++];     // 4Y (next group)
        r2 = r2 << 2 (v)                   || r1.l=w[i2++];  // 2u
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                          || r1.h=w[i3++];  // 2v

.L1555: r2=[i1++]; // oy

        l1 = 0;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb555_line)
|
|
351
|
|
/*
 * _ff_bfin_yuv2rgb24_line
 *
 * Converts one line of planar YUV 4:2:0 samples to packed 3-byte pixels.
 *
 * In:     r0 = Y, r1 = U, r2 = V
 *         [fp+ARG_OUT] = out, [fp+ARG_W] = dW, [fp+ARG_COEFF] = coeffs
 * Loop:   dW >> 2 passes; each pass consumes 4 Y, 2 U and 2 V bytes and
 *         writes 12 bytes (4 pixels).  Pixels are produced in pairs: p1
 *         writes the bytes of the first pixel of a pair and p2 (= p1 + 3)
 *         the bytes of the second; both then skip 3 bytes to reach the next
 *         pair.
 * Coeffs: i1 walks the table as a circular buffer (b1 = base,
 *         l1 = COEFF_LEN); l1 is reset to 0 before returning.  The mask
 *         entries are still loaded (into r5) to keep i1 in step, but their
 *         values are not used by this routine.
 * Saved:  r7:4 are pushed on entry and popped on exit.
 * Not restored: r0-r3, p0-p2, i0-i3, b1, m0, a0, a1, lc0.
 * Note:   the loads are software pipelined, so the final pass still reads
 *         4 Y / 2 U / 2 V bytes beyond the data it converts.
 */
DEFUN(yuv2rgb24_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];
        p2 = p1;
        p2 += 3;             // p2 addresses the second pixel of each pair

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF]; // coeff buffer
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0 = [i0++];         // 4Y
        r1.l = w[i2++];      // 2u
        r1.h = w[i3++];      // 2v
        p0 = p0>>2;

        lsetup (.L0888, .L1888) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
.L0888:
        (r4,r5) = byteop16m (r1:0, r3:2)   || r3=[i1++];   // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);               // y1,y0
        r4 = r4 << 2 (v);               // y3,y2
        r6 = r6 << 2 (v) || r0=[i1++];  // u1,u0, r0=zero
        r7 = r7 << 2 (v) || r1=[i1++];  // v1,v0  r1=cy

        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l     || r1=[i1++];   // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        /* take the chroma term back out so a1/a0 hold Y' again */
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++];   // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)      || r1=[i1++];   // cbu
        r2=r2>>16 || B[p1++]=r2;        // low lane -> first pixel of the pair
        B[p2++]=r2;                     // high lane -> second pixel of the pair

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++];   // bmask
        r3 = byteop3p(r3:2, r1:0)(LO)      || r1=[i1++];   // cgu

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++];   // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        r2 = byteop3p(r3:2, r1:0)(LO)      || r5=[i1++m0]; // gmask (value unused); i1 then points back at cy

        r2=r2>>16 || B[p1++]=r2;
        B[p2++]=r2;

        r3=r3>>16 || B[p1++]=r3;        // B was held in r3 and is written last
        B[p2++]=r3                         || r1=[i1++];   // cy

        p1+=3;                          // skip the pixel just written through p2
        p2+=3;
        /* Y' = y*cy */
        a1 = r1.h*r4.h, a0 = r1.l*r4.l     || r1=[i1++];   // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++];   // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)      || r1=[i1++];   // cbu
        r2=r2>>16 || B[p1++]=r2;
        B[p2++]=r2;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++];   // bmask
        r3 = byteop3p(r3:2, r1:0)(LO)      || r1=[i1++];   // cgu

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++];   // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
        r2 = byteop3p(r3:2, r1:0)(LO)      || r5=[i1++];   // gmask
        r2=r2>>16 || B[p1++]=r2 || r0 = [i0++];    // 4y (next group)
                     B[p2++]=r2 || r1.l = w[i2++]; // 2u
        r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
                     B[p2++]=r3 || r2=[i1++];      // oy

        p1+=3;
.L1888: p2+=3;

        l1 = 0;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb24_line)
|
|
458
|
|
459
|
|
460
|
|
/*
 * fp-relative offsets (after "link 0") of the stack-passed arguments of
 * uyvytoyv12 / yuyvtoyv12; src, ydst and udst arrive in r0, r1 and r2.
 */
#define ARG_vdst        20
#define ARG_width       24
#define ARG_height      28
#define ARG_lumStride   32
#define ARG_chromStride 36
#define ARG_srcStride   40
|
|
467
|
|
/*
 * _ff_bfin_uyvytoyv12
 *
 * Converts packed UYVY (bytes U Y V Y) to planar YV12.  Two source lines
 * are processed per outer pass: the luma of both lines is copied out, and
 * the chroma of the two lines is combined with byteop1p (byte-wise average)
 * into one output chroma line.
 *
 * In:    r0 = src, r1 = ydst, r2 = udst
 *        stack: vdst, width, height, lumStride, chromStride, srcStride
 *        (see the ARG_* offsets)
 * Loops: outer = height >> 1 line pairs, inner = width >> 2 groups of
 *        4 pixels (8 source bytes per line).
 * Saved: r7:4 and p5:4 are pushed on entry and popped on exit.
 *
 * Row stepping: the source words for the next group are always read one
 * group ahead, so after the inner loop i0/i1 sit 8 bytes past the data that
 * was consumed.  With w4 = width rounded down to a multiple of 4:
 *   i0/i1 have advanced 2*w4 + 8 and must advance 2*srcStride in total
 *                                  -> m0 = 2*(srcStride - w4) - 8
 *   p0/p1 have advanced w4 and must advance 2*lumStride in total
 *                                  -> p2 = 2*lumStride - w4
 * For tightly packed buffers (srcStride == 2*width, lumStride == width,
 * width a multiple of 4) these equal the values srcStride - 8 and lumStride
 * that were used previously; padded strides are now stepped correctly too.
 */
DEFUN(uyvytoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                         long width, long height,
                         long lumStride, long chromStride, long srcStride)):
        link 0;
        [--sp] = (r7:4,p5:4);

        p0 = r1;       // Y top even

        i2 = r2; // *u
        r2 = [fp + ARG_vdst];
        i3 = r2; // *v

        r1 = [fp + ARG_srcStride];
        r3 = [fp + ARG_width];
        r3 = r3 >> 2;
        r3 = r3 << 2;  // w4: pixels actually handled per line
        r2 = r0 + r1;  // second source line of the pair
        r1 = r1 - r3;
        r1 = r1 << 1;  // 2*(srcStride - w4)
        r1 += -8;      // i0,i1 is pre read need to correct
        m0 = r1;

        i0 = r0;  // uyvy_T even
        i1 = r2;  // uyvy_B odd

        p2 = [fp + ARG_lumStride];
        p1 = p0 + p2;  // Y bot odd

        r2 = p2;
        r2 = r2 << 1;
        r2 = r2 - r3;  // 2*lumStride - w4
        p2 = r2;       // luma step applied after each line pair

        p5 = [fp + ARG_width];
        p4 = [fp + ARG_height];
        r0 = p5;
        p4 = p4 >> 1;
        p5 = p5 >> 2;

        r2 = [fp + ARG_chromStride];
        r0 = r0 >> 1;
        r2 = r2 - r0;  // chroma step after each line pair: chromStride - width/2
        m1 = r2;

        /*   I0,I1 - src input line pointers
         *   p0,p1 - luma output line pointers
         *   I2    - dstU
         *   I3    - dstV
         */

        lsetup (0f, 1f) lc1 = p4;   // H/2
0:        r0 = [i0++] || r2 = [i1++];
          r1 = [i0++] || r3 = [i1++];
          r4 = byteop1p(r1:0, r3:2);
          r5 = byteop1p(r1:0, r3:2) (r);
          lsetup (2f, 3f) lc0 = p5; // W/4
2:          r0 = r0 >> 8(v);        // move the Y bytes to the low byte of each half
            r1 = r1 >> 8(v);
            r2 = r2 >> 8(v);
            r3 = r3 >> 8(v);
            r0 = bytepack(r0, r1);
            r2 = bytepack(r2, r3)         ||  [p0++] = r0;    // yyyy
            r6 = pack(r5.l, r4.l)         ||  [p1++] = r2;    // yyyy
            r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
            r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
            r4 = byteop1p(r1:0, r3:2)     ||  w[i2++] = r6.l; // uu
3:          r5 = byteop1p(r1:0, r3:2) (r) ||  w[i3++] = r6.h; // vv

          i0 += m0;
          i1 += m0;
          i2 += m1;
          i3 += m1;
          p0 = p0 + p2;
1:        p1 = p1 + p2;

        (r7:4,p5:4) = [sp++];
        unlink;
        rts;
DEFUN_END(uyvytoyv12)
|
|
537
|
|
/*
 * _ff_bfin_yuyvtoyv12
 *
 * Converts packed YUYV (bytes Y U Y V) to planar YV12.  Two source lines
 * are processed per outer pass: the luma of both lines is copied out, and
 * the chroma of the two lines is combined with byteop1p (byte-wise average)
 * into one output chroma line.
 *
 * In:    r0 = src, r1 = ydst, r2 = udst
 *        stack: vdst, width, height, lumStride, chromStride, srcStride
 *        (see the ARG_* offsets)
 * Loops: outer = height >> 1 line pairs, inner = width >> 2 groups of
 *        4 pixels (8 source bytes per line).
 * Saved: r7:4 and p5:4 are pushed on entry and popped on exit.
 *
 * Row stepping: the source words for the next group are always read one
 * group ahead, so after the inner loop i0/i1 sit 8 bytes past the data that
 * was consumed.  With w4 = width rounded down to a multiple of 4:
 *   i0/i1 have advanced 2*w4 + 8 and must advance 2*srcStride in total
 *                                  -> m0 = 2*(srcStride - w4) - 8
 *   p0/p1 have advanced w4 and must advance 2*lumStride in total
 *                                  -> p2 = 2*lumStride - w4
 * For tightly packed buffers (srcStride == 2*width, lumStride == width,
 * width a multiple of 4) these equal the values srcStride - 8 and lumStride
 * that were used previously; padded strides are now stepped correctly too.
 */
DEFUN(yuyvtoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                         long width, long height,
                         long lumStride, long chromStride, long srcStride)):
        link 0;
        [--sp] = (r7:4,p5:4);

        p0 = r1;       // Y top even

        i2 = r2; // *u
        r2 = [fp + ARG_vdst];
        i3 = r2; // *v

        r1 = [fp + ARG_srcStride];
        r3 = [fp + ARG_width];
        r3 = r3 >> 2;
        r3 = r3 << 2;  // w4: pixels actually handled per line
        r2 = r0 + r1;  // second source line of the pair
        r1 = r1 - r3;
        r1 = r1 << 1;  // 2*(srcStride - w4)
        r1 += -8;      // i0,i1 is pre read need to correct
        m0 = r1;

        i0 = r0;  // yuyv_T even
        i1 = r2;  // yuyv_B odd

        p2 = [fp + ARG_lumStride];
        p1 = p0 + p2;  // Y bot odd

        r2 = p2;
        r2 = r2 << 1;
        r2 = r2 - r3;  // 2*lumStride - w4
        p2 = r2;       // luma step applied after each line pair

        p5 = [fp + ARG_width];
        p4 = [fp + ARG_height];
        r0 = p5;
        p4 = p4 >> 1;
        p5 = p5 >> 2;

        r2 = [fp + ARG_chromStride];
        r0 = r0 >> 1;
        r2 = r2 - r0;  // chroma step after each line pair: chromStride - width/2
        m1 = r2;

        /*   I0,I1 - src input line pointers
         *   p0,p1 - luma output line pointers
         *   I2    - dstU
         *   I3    - dstV
         */

        lsetup (0f, 1f) lc1 = p4;   // H/2
0:        r0 = [i0++] || r2 = [i1++];
          r1 = [i0++] || r3 = [i1++];
          r4 = bytepack(r0, r1);
          r5 = bytepack(r2, r3);
          lsetup (2f, 3f) lc0 = p5; // W/4
2:          r0 = r0 >> 8(v) || [p0++] = r4;  // yyyy-even
            r1 = r1 >> 8(v) || [p1++] = r5;  // yyyy-odd
            r2 = r2 >> 8(v);        // move the chroma bytes to the low byte of each half
            r3 = r3 >> 8(v);
            r4 = byteop1p(r1:0, r3:2);
            r5 = byteop1p(r1:0, r3:2) (r);
            r6 = pack(r5.l, r4.l);
            r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
            r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
            r4 = bytepack(r0, r1)         ||  w[i2++] = r6.l; // uu
3:          r5 = bytepack(r2, r3)         ||  w[i3++] = r6.h; // vv

          i0 += m0;
          i1 += m0;
          i2 += m1;
          i3 += m1;
          p0 = p0 + p2;
1:        p1 = p1 + p2;

        (r7:4,p5:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuyvtoyv12)
|