comparison libswscale/bfin/internal_bfin.S @ 29042:c198fbb7172e

Move bfin specific code to its subdir.
author ramiro
date Fri, 27 Mar 2009 22:56:56 +0000
parents
children cdb6693dd949
comparison
equal deleted inserted replaced
29041:1b8e2b6fa928 29042:c198fbb7172e
1 /*
2 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
3 * April 20, 2007
4 *
5 * Blackfin video color space converter operations
6 * convert I420 YV12 to RGB in various formats
7 *
8 * This file is part of FFmpeg.
9 *
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25
26 /*
27 YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
28 and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts.
29
30
31 The following calculation is used for the conversion:
32
33 r = clipz((y-oy)*cy + crv*(v-128))
34 g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
35 b = clipz((y-oy)*cy + cbu*(u-128))
36
37 y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
38
39
40 New factorization to eliminate the truncation error which was
41 occurring due to the byteop3p.
42
43
44 1) Use byteop16m to subtract quad bytes. It operates on U8 data,
45 so the offsets need to be renormalized to 8 bits.
46
47 2) Scale operands up by a factor of 4 not 8 because Blackfin
48 multiplies include a shift.
49
50 3) Compute into the accumulators cy*yx0, cy*yx1.
51
52 4) Compute each of the linear equations:
53 r = clipz((y - oy) * cy + crv * (v - 128))
54
55 g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
56
57 b = clipz((y - oy) * cy + cbu * (u - 128))
58
59 Reuse of the accumulators requires that we actually multiply
60 twice once with addition and the second time with a subtraction.
61
62 Because of this we need to compute the equations in the order R B
63 then G saving the writes for B in the case of 24/32 bit color
64 formats.
65
66 API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67 int dW, uint32_t *coeffs);
68
69 A B
70 --- ---
71 i2 = cb i3 = cr
72 i1 = coeff i0 = y
73
74 Where coeffs have the following layout in memory.
75
76 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
77
78 coeffs is a pointer to oy.
79
80 The {rgb} masks are only utilized by the 565 packing algorithm. Note the data
81 replication is used to simplify the internal algorithms for the dual Mac
82 architecture of BlackFin.
83
84 All routines are exported with _ff_bfin_ as a symbol prefix.
85
86 Rough performance gain compared against -O3:
87
88 2779809/1484290 187.28%
89
90 which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
91 c/pel for the optimized implementations. Not sure why there is such a
92 huge variation on the reference codes on Blackfin I guess it must have
93 to do with the memory system.
94 */
95
96 #define mL3 .text /* default code section */
97 #ifdef __FDPIC__
98 #define mL1 .l1.text /* FDPIC builds use a separate section; presumably on-chip L1 code memory -- TODO confirm */
99 #else
100 #define mL1 mL3
101 #endif
102 #define MEM mL1 /* section used by the yuv2rgb*_line routines */
103 /* DEFUN(fname,where,interface): switch to section <where> and open the global function symbol _ff_bfin_<fname>. <interface> is not expanded; it only documents the C prototype. The caller appends the ':' that turns the name into a label. */
104 #define DEFUN(fname,where,interface) \
105 .section where; \
106 .global _ff_bfin_ ## fname; \
107 .type _ff_bfin_ ## fname, STT_FUNC; \
108 .align 8; \
109 _ff_bfin_ ## fname
110 /* DEFUN_END(fname): record the size of _ff_bfin_<fname> as (current location - symbol start). */
111 #define DEFUN_END(fname) \
112 .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
113
114
115 .text
116 /* The coefficient table holds 11 32-bit words; COEFF_LEN is its size in bytes and is loaded into l1. */
117 #define COEFF_LEN 11*4
118 #define COEFF_REL_CY_OFF 4*4 /* loaded into m0; used as the post-modify of the first gmask read in each loop pass */
119 /* fp-relative offsets (after "link 0") of the out, dW and coeffs arguments of the yuv2rgb*_line routines. */
120 #define ARG_OUT 20
121 #define ARG_W 24
122 #define ARG_COEFF 28
123
124 DEFUN(yuv2rgb565_line,MEM,
125 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
126 link 0; // yuv2rgb565_line: convert one line of planar YUV to packed 16-bit (5:6:5) pixels, 4 pixels per loop pass
127 [--sp] = (r7:4); // save r7..r4 (restored before rts)
128 p1 = [fp+ARG_OUT]; // p1 = out
129 r3 = [fp+ARG_W]; // r3 = dW
130 // Y, U and V arrive in r0..r2 and are moved to index registers.
131 i0 = r0; // i0 = Y
132 i2 = r1; // i2 = U
133 i3 = r2; // i3 = V
134 // i1 walks coeffs; b1/l1 set it up as a circular buffer of COEFF_LEN bytes.
135 r0 = [fp+ARG_COEFF];
136 i1 = r0;
137 b1 = i1;
138 l1 = COEFF_LEN;
139 m0 = COEFF_REL_CY_OFF; // used once per pass to step from gmask round to cy
140 p0 = r3;
141 // Pre-load the first group of samples before entering the loop.
142 r0 = [i0++]; // 4 Y bytes (one 32-bit load)
143 r1.l = w[i2++]; // 2u
144 r1.h = w[i3++]; // 2v
145 p0 = p0>>2; // loop count = dW/4
146
147 lsetup (.L0565, .L1565) lc0 = p0;
148
149 /*
150 coeffs in read order: uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
151 r0 -- used to load 4ys
152 r1 -- used to load 2us,2vs
153 r4 -- y3,y2
154 r5 -- y1,y0
155 r6 -- u1,u0
156 r7 -- v1,v0
157 */
158 r2=[i1++]; // oy
159 .L0565:
160 /*
161 rrrrrrrr gggggggg bbbbbbbb
162 5432109876543210
163 bbbbb >>3
164 gggggggg <<3
165 rrrrrrrr <<8
166 rrrrrggggggbbbbb
167 */
168 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // byte-wise subtract for the Y samples (y - oy); load oc
169 (r7,r6) = byteop16m (r1:0, r3:2) (r); // same subtract for the U/V samples (minus oc)
170 r5 = r5 << 2 (v); // y1,y0 (x4 prescale, per 16-bit half)
171 r4 = r4 << 2 (v); // y3,y2
172 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
173 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
174 /* Y' = y*cy */
175 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // a1 = Y' of pixel 1, a0 = Y' of pixel 0; load crv
176 // Pixels 0 and 1 share u0/v0. NOTE(review): which colour lands in which bit-field depends on the masks and coefficients the caller supplies -- confirm against the diagram above.
177 /* R = Y+ crv*(Cr-128) */
178 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
179 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // undo the chroma term so a1/a0 hold Y' again; load rmask
180 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // clip step of the formula; load cbu
181 r2 = r2 >> 3 (v); // drop the 3 low bits of each value
182 r3 = r2 & r5; // start the pixel pair with the rmask field
183
184 /* B = Y+ cbu*(Cb-128) */
185 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
186 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // restore Y'; load bmask
187 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
188 r2 = r2 << 8 (v); // move the value to the top of each 16-bit pixel
189 r2 = r2 & r5;
190 r3 = r3 | r2;
191
192 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
193 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
194 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
195 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask; the m0 post-modify wraps i1 round to cy
196 r2 = r2 << 3 (v);
197 r2 = r2 & r5;
198 r3 = r3 | r2;
199 [p1++]=r3 || r1=[i1++]; // store pixels 0 and 1 as one 32-bit word; r1 = cy
200 // Second half of the pass: pixels 2 and 3 use y3,y2 (r4) with u1/v1 (r6.h/r7.h).
201 /* Y' = y*cy */
202
203 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
204
205 /* R = Y+ crv*(Cr-128) */
206 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
207 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
208 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
209 r2 = r2 >> 3 (v);
210 r3 = r2 & r5;
211
212 /* B = Y+ cbu*(Cb-128) */
213 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
214 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
215 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
216 r2 = r2 << 8 (v);
217 r2 = r2 & r5;
218 r3 = r3 | r2;
219
220 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
221 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
222 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask (last word; i1 wraps to oy)
223 r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++]; // pre-load the next 4 Y bytes
224 r2 = r2 << 3 (v) || r1.l = w[i2++]; // 2u
225 r2 = r2 & r5;
226 r3 = r3 | r2;
227 [p1++]=r3 || r1.h = w[i3++]; // store pixels 2 and 3; 2v
228 .L1565: r2=[i1++]; // oy
229 // NOTE(review): the last pass still pre-loads 4 Y, 2 U and 2 V bytes beyond the pixels that were converted.
230 l1 = 0; // turn circular addressing on i1 off again
231
232 (r7:4) = [sp++];
233 unlink;
234 rts;
235 DEFUN_END(yuv2rgb565_line)
236
237 DEFUN(yuv2rgb555_line,MEM,
238 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
239 link 0; // yuv2rgb555_line: convert one line of planar YUV to packed 16-bit (x:5:5:5) pixels, 4 pixels per loop pass
240 [--sp] = (r7:4); // save r7..r4 (restored before rts)
241 p1 = [fp+ARG_OUT]; // p1 = out
242 r3 = [fp+ARG_W]; // r3 = dW
243 // Y, U and V arrive in r0..r2 and are moved to index registers.
244 i0 = r0; // i0 = Y
245 i2 = r1; // i2 = U
246 i3 = r2; // i3 = V
247 // i1 walks coeffs; b1/l1 set it up as a circular buffer of COEFF_LEN bytes.
248 r0 = [fp+ARG_COEFF];
249 i1 = r0;
250 b1 = i1;
251 l1 = COEFF_LEN;
252 m0 = COEFF_REL_CY_OFF; // used once per pass to step from gmask round to cy
253 p0 = r3;
254 // Pre-load the first group of samples before entering the loop.
255 r0 = [i0++]; // 4 Y bytes (one 32-bit load)
256 r1.l = w[i2++]; // 2u
257 r1.h = w[i3++]; // 2v
258 p0 = p0>>2; // loop count = dW/4
259
260 lsetup (.L0555, .L1555) lc0 = p0;
261
262 /*
263 coeffs in read order: uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
264 r0 -- used to load 4ys
265 r1 -- used to load 2us,2vs
266 r4 -- y3,y2
267 r5 -- y1,y0
268 r6 -- u1,u0
269 r7 -- v1,v0
270 */
271 r2=[i1++]; // oy
272 .L0555:
273 /*
274 rrrrrrrr gggggggg bbbbbbbb
275 5432109876543210
276 bbbbb >>3
277 gggggggg <<2
278 rrrrrrrr <<7
279 xrrrrrgggggbbbbb
280 */
281
282 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // byte-wise subtract for the Y samples (y - oy); load oc
283 (r7,r6) = byteop16m (r1:0, r3:2) (r); // same subtract for the U/V samples (minus oc)
284 r5 = r5 << 2 (v); // y1,y0 (x4 prescale, per 16-bit half)
285 r4 = r4 << 2 (v); // y3,y2
286 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
287 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
288 /* Y' = y*cy */
289 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // a1 = Y' of pixel 1, a0 = Y' of pixel 0; load crv
290 // Pixels 0 and 1 share u0/v0. NOTE(review): which colour lands in which bit-field depends on the masks and coefficients the caller supplies -- confirm against the diagram above.
291 /* R = Y+ crv*(Cr-128) */
292 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
293 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // undo the chroma term so a1/a0 hold Y' again; load rmask
294 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // clip step of the formula; load cbu
295 r2 = r2 >> 3 (v); // drop the 3 low bits of each value
296 r3 = r2 & r5; // start the pixel pair with the rmask field
297
298 /* B = Y+ cbu*(Cb-128) */
299 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
300 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // restore Y'; load bmask
301 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
302 r2 = r2 << 7 (v); // 555 layout: field sits one bit lower than in 565
303 r2 = r2 & r5;
304 r3 = r3 | r2;
305
306 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
307 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
308 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
309 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask; the m0 post-modify wraps i1 round to cy
310 r2 = r2 << 2 (v);
311 r2 = r2 & r5;
312 r3 = r3 | r2;
313 [p1++]=r3 || r1=[i1++]; // store pixels 0 and 1 as one 32-bit word; r1 = cy
314 // Second half of the pass: pixels 2 and 3 use y3,y2 (r4) with u1/v1 (r6.h/r7.h).
315 /* Y' = y*cy */
316
317 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
318
319 /* R = Y+ crv*(Cr-128) */
320 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
321 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
322 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
323 r2 = r2 >> 3 (v);
324 r3 = r2 & r5;
325
326 /* B = Y+ cbu*(Cb-128) */
327 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
328 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
329 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
330 r2 = r2 << 7 (v);
331 r2 = r2 & r5;
332 r3 = r3 | r2;
333
334 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
335 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
336 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask (last word; i1 wraps to oy)
337 r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++]; // pre-load the next 4 Y bytes
338 r2 = r2 << 2 (v) || r1.l=w[i2++]; // 2u
339 r2 = r2 & r5;
340 r3 = r3 | r2;
341 [p1++]=r3 || r1.h=w[i3++]; // store pixels 2 and 3; 2v
342
343 .L1555: r2=[i1++]; // oy
344 // NOTE(review): the last pass still pre-loads 4 Y, 2 U and 2 V bytes beyond the pixels that were converted.
345 l1 = 0; // turn circular addressing on i1 off again
346
347 (r7:4) = [sp++];
348 unlink;
349 rts;
350 DEFUN_END(yuv2rgb555_line)
351
352 DEFUN(yuv2rgb24_line,MEM,
353 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
354 link 0; // yuv2rgb24_line: convert one line of planar YUV to 3-byte-per-pixel output, 4 pixels per loop pass
355 [--sp] = (r7:4); // save r7..r4 (restored before rts)
356 p1 = [fp+ARG_OUT]; // p1 = out: byte pointer for the even pixel of each pair
357 r3 = [fp+ARG_W]; // r3 = dW
358 p2 = p1;
359 p2 += 3; // p2 = out + 3: byte pointer for the odd pixel of each pair
360 // Y, U and V arrive in r0..r2 and are moved to index registers.
361 i0 = r0; // i0 = Y
362 i2 = r1; // i2 = U
363 i3 = r2; // i3 = V
364 // i1 walks coeffs; b1/l1 set it up as a circular buffer of COEFF_LEN bytes.
365 r0 = [fp+ARG_COEFF]; // coeff buffer
366 i1 = r0;
367 b1 = i1;
368 l1 = COEFF_LEN;
369 m0 = COEFF_REL_CY_OFF; // used once per pass to step from gmask round to cy
370 p0 = r3;
371 // Pre-load the first group of samples before entering the loop.
372 r0 = [i0++]; // 4 Y bytes (one 32-bit load)
373 r1.l = w[i2++]; // 2u
374 r1.h = w[i3++]; // 2v
375 p0 = p0>>2; // loop count = dW/4
376
377 lsetup (.L0888, .L1888) lc0 = p0;
378
379 /*
380 coeffs in read order: uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
381 (the three mask words are loaded into r5 only to advance i1; this routine never reads them)
382 r0 -- used to load 4ys
383 r1 -- used to load 2us,2vs; r4 -- y3,y2
384 r5 -- y1,y0
385 r6 -- u1,u0
386 r7 -- v1,v0
387 */
388 r2=[i1++]; // oy
389 .L0888:
390 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // byte-wise subtract for the Y samples (y - oy); load oc
391 (r7,r6) = byteop16m (r1:0, r3:2) (r); // same subtract for the U/V samples (minus oc)
392 r5 = r5 << 2 (v); // y1,y0 (x4 prescale, per 16-bit half)
393 r4 = r4 << 2 (v); // y3,y2
394 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
395 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
396
397 /* Y' = y*cy */
398 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // a1 = Y' of pixel 1, a0 = Y' of pixel 0; load crv
399
400 /* R = Y+ crv*(Cr-128) */
401 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
402 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // undo the chroma term so a1/a0 hold Y' again; rmask (unused)
403 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // clip step of the formula; load cbu
404 r2=r2>>16 || B[p1++]=r2; // byte 0 of pixel 0 (the store is expected to use the pre-shift r2)
405 B[p2++]=r2; // byte 0 of pixel 1
406
407 /* B = Y+ cbu*(Cb-128) */
408 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
409 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // restore Y'; bmask (unused)
410 r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // result kept in r3 so it can be written after G; load cgu
411
412 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
413 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
414 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
415 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask (unused); the m0 post-modify wraps i1 round to cy
416
417 r2=r2>>16 || B[p1++]=r2; // byte 1 of pixel 0
418 B[p2++]=r2; // byte 1 of pixel 1
419
420 r3=r3>>16 || B[p1++]=r3; // byte 2 of pixel 0
421 B[p2++]=r3 || r1=[i1++]; // byte 2 of pixel 1; r1 = cy
422
423 p1+=3; // each pointer skips the pixel the other one just wrote
424 p2+=3;
425 /* Y' = y*cy */
426 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // pixels 2 and 3 (y3,y2 with u1/v1); load crv
427
428 /* R = Y+ crv*(Cr-128) */
429 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
430 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
431 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
432 r2=r2>>16 || B[p1++]=r2;
433 B[p2++]=r2;
434
435 /* B = Y+ cbu*(Cb-128) */
436 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
437 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
438 r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
439
440 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
441 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
442 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
443 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++]; // gmask (last word; i1 wraps to oy)
444 r2=r2>>16 || B[p1++]=r2 || r0 = [i0++]; // pre-load the next 4 Y bytes
445 B[p2++]=r2 || r1.l = w[i2++]; // 2u
446 r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
447 B[p2++]=r3 || r2=[i1++]; // oy
448
449 p1+=3;
450 .L1888: p2+=3;
451 // NOTE(review): the last pass still pre-loads 4 Y, 2 U and 2 V bytes beyond the pixels that were converted.
452 l1 = 0; // turn circular addressing on i1 off again
453
454 (r7:4) = [sp++];
455 unlink;
456 rts;
457 DEFUN_END(yuv2rgb24_line)
458
459
460
461 #define ARG_vdst 20 /* fp-relative offsets (after "link 0") of the stack-passed arguments of uyvytoyv12/yuyvtoyv12 */
462 #define ARG_width 24
463 #define ARG_height 28
464 #define ARG_lumStride 32
465 #define ARG_chromStride 36
466 #define ARG_srcStride 40
467
468 DEFUN(uyvytoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
469 long width, long height,
470 long lumStride, long chromStride, long srcStride)):
471 link 0; // uyvytoyv12: split packed UYVY into Y, U and V planes, two source rows per outer pass
472 [--sp] = (r7:4,p5:4); // save r7..r4 and p5..p4 (restored before rts)
473 // src, ydst and udst arrive in r0..r2; the remaining arguments are read from the stack.
474 p0 = r1; // Y top even
475
476 i2 = r2; // *u
477 r2 = [fp + ARG_vdst];
478 i3 = r2; // *v
479
480 r1 = [fp + ARG_srcStride];
481 r2 = r0 + r1; // second source row = src + srcStride
482 r1 += -8; // i0,i1 have already read 8 bytes ahead when a row ends; compensate
483 m0 = r1; // m0 = srcStride - 8
484
485 i0 = r0; // uyvy_T even
486 i1 = r2; // uyvy_B odd
487
488 p2 = [fp + ARG_lumStride];
489 p1 = p0 + p2; // Y bot odd
490
491 p5 = [fp + ARG_width];
492 p4 = [fp + ARG_height];
493 r0 = p5;
494 p4 = p4 >> 1; // outer count: height/2 row pairs
495 p5 = p5 >> 2; // inner count: width/4 groups of 4 pixels
496
497 r2 = [fp + ARG_chromStride];
498 r0 = r0 >> 1; // width/2
499 r2 = r2 - r0;
500 m1 = r2; // m1 = chromStride - width/2
501
502 /* I0,I1 - src input line pointers
503 * p0,p1 - luma output line pointers
504 * I2 - dstU
505 * I3 - dstV
506 */
507
508 lsetup (0f, 1f) lc1 = p4; // H/2
509 0: r0 = [i0++] || r2 = [i1++]; // pre-read 8 bytes (4 pixels) from the top row (r1:r0) ...
510 r1 = [i0++] || r3 = [i1++]; // ... and from the bottom row (r3:r2)
511 r4 = byteop1p(r1:0, r3:2); // byte-wise average of top and bottom row (quad 8-bit average per the Blackfin ISA), first word
512 r5 = byteop1p(r1:0, r3:2) (r); // same for the second word
513 lsetup (2f, 3f) lc0 = p5; // W/4
514 2: r0 = r0 >> 8(v); // bring the Y byte of every 16-bit half down to the low byte
515 r1 = r1 >> 8(v);
516 r2 = r2 >> 8(v);
517 r3 = r3 >> 8(v);
518 r0 = bytepack(r0, r1); // 4 Y bytes of the top row
519 r2 = bytepack(r2, r3) || [p0++] = r0; // yyyy (top row stored; bottom row packed)
520 r6 = pack(r5.l, r4.l) || [p1++] = r2; // yyyy (bottom row stored); gather the two averaged U halves
521 r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++]; // gather the two averaged V halves; pre-read the next group
522 r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++]; // r6 = u,u,v,v bytes
523 r4 = byteop1p(r1:0, r3:2) || w[i2++] = r6.l; // uu
524 3: r5 = byteop1p(r1:0, r3:2) (r) || w[i3++] = r6.h; // vv
525 // End of a row pair. NOTE(review): 8 bytes per source row have been read ahead here, also after the last row pair.
526 i0 += m0; // NOTE(review): i0 has consumed 2*width+8 bytes, so this reaches the next row pair only if srcStride == 2*width -- confirm callers
527 i1 += m0;
528 i2 += m1; // next chroma row: width/2 bytes were written, m1 adds the remainder of chromStride
529 i3 += m1;
530 p0 = p0 + p2; // NOTE(review): p0 has already advanced by width, so this assumes lumStride == width -- confirm callers
531 1: p1 = p1 + p2;
532
533 (r7:4,p5:4) = [sp++];
534 unlink;
535 rts;
536 DEFUN_END(uyvytoyv12)
537
538 DEFUN(yuyvtoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
539 long width, long height,
540 long lumStride, long chromStride, long srcStride)):
541 link 0; // yuyvtoyv12: split packed YUYV into Y, U and V planes, two source rows per outer pass
542 [--sp] = (r7:4,p5:4); // save r7..r4 and p5..p4 (restored before rts)
543 // src, ydst and udst arrive in r0..r2; the remaining arguments are read from the stack.
544 p0 = r1; // Y top even
545
546 i2 = r2; // *u
547 r2 = [fp + ARG_vdst];
548 i3 = r2; // *v
549
550 r1 = [fp + ARG_srcStride];
551 r2 = r0 + r1; // second source row = src + srcStride
552 r1 += -8; // i0,i1 have already read 8 bytes ahead when a row ends; compensate
553 m0 = r1; // m0 = srcStride - 8
554
555 i0 = r0; // yuyv_T even
556 i1 = r2; // yuyv_B odd
557
558 p2 = [fp + ARG_lumStride];
559 p1 = p0 + p2; // Y bot odd
560
561 p5 = [fp + ARG_width];
562 p4 = [fp + ARG_height];
563 r0 = p5;
564 p4 = p4 >> 1; // outer count: height/2 row pairs
565 p5 = p5 >> 2; // inner count: width/4 groups of 4 pixels
566
567 r2 = [fp + ARG_chromStride];
568 r0 = r0 >> 1; // width/2
569 r2 = r2 - r0;
570 m1 = r2; // m1 = chromStride - width/2
571
572 /* I0,I1 - src input line pointers
573 * p0,p1 - luma output line pointers
574 * I2 - dstU
575 * I3 - dstV
576 */
577
578 lsetup (0f, 1f) lc1 = p4; // H/2
579 0: r0 = [i0++] || r2 = [i1++]; // pre-read 8 bytes (4 pixels) from the top row (r1:r0) ...
580 r1 = [i0++] || r3 = [i1++]; // ... and from the bottom row (r3:r2)
581 r4 = bytepack(r0, r1); // low byte of every 16-bit half = the 4 Y bytes of the top row
582 r5 = bytepack(r2, r3); // 4 Y bytes of the bottom row
583 lsetup (2f, 3f) lc0 = p5; // W/4
584 2: r0 = r0 >> 8(v) || [p0++] = r4; // yyyy-even; the shift brings the chroma byte of each half down
585 r1 = r1 >> 8(v) || [p1++] = r5; // yyyy-odd
586 r2 = r2 >> 8(v);
587 r3 = r3 >> 8(v);
588 r4 = byteop1p(r1:0, r3:2); // byte-wise average of top and bottom row chroma (quad 8-bit average per the Blackfin ISA), first word
589 r5 = byteop1p(r1:0, r3:2) (r); // same for the second word
590 r6 = pack(r5.l, r4.l); // gather the two averaged U halves
591 r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++]; // gather the two averaged V halves; pre-read the next group
592 r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++]; // r6 = u,u,v,v bytes
593 r4 = bytepack(r0, r1) || w[i2++] = r6.l; // uu
594 3: r5 = bytepack(r2, r3) || w[i3++] = r6.h; // vv
595 // End of a row pair. NOTE(review): 8 bytes per source row have been read ahead here, also after the last row pair.
596 i0 += m0; // NOTE(review): i0 has consumed 2*width+8 bytes, so this reaches the next row pair only if srcStride == 2*width -- confirm callers
597 i1 += m0;
598 i2 += m1; // next chroma row: width/2 bytes were written, m1 adds the remainder of chromStride
599 i3 += m1;
600 p0 = p0 + p2; // NOTE(review): p0 has already advanced by width, so this assumes lumStride == width -- confirm callers
601 1: p1 = p1 + p2;
602
603 (r7:4,p5:4) = [sp++];
604 unlink;
605 rts;
606 DEFUN_END(yuyvtoyv12)