comparison libswscale/internal_bfin.S @ 23289:786823779b67

Blackfin optimized YUV420 to RGB CSC Color Space Converters. YUV2 -> RGB BGR for 565, 555 and 888 a.k.a. 24bit color. Speed-up compared to C version compiled with -O3 187.28% Patch by Marc Hoffman %mmh A pleasantst P com% Original thread: Date: May 9, 2007 2:46 AM Subject: [FFmpeg-devel] PATCH BlackFin yuv2rgb color space conversion
author gpoirier
date Sun, 13 May 2007 19:22:32 +0000
parents
children d7af89e69e92
comparison
equal deleted inserted replaced
23288:3e0d632ec805 23289:786823779b67
1 /*
2 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
3 * April 20, 2007
4 *
5 * Blackfin Video Color Space Converters Operations
6 * convert I420 YV12 to RGB in various formats,
7 *
8 * This file is part of FFmpeg.
9 *
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25
26 /*
27 YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
28 and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts
29
30
31 The following calculation is used for the conversion:
32
33 r = clipz((y-oy)*cy + crv*(v-128))
34 g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
35 b = clipz((y-oy)*cy + cbu*(u-128))
36
37 y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision.
38
39
40 New factorization to eliminate the truncation error which was
41 occurring due to the byteop3p.
42
43
44 1) use byteop16m to subtract quad bytes; it operates on U8 data,
45 so the offsets need to be renormalized to 8 bits.
46
47 2) scale operands up by a factor of 4 not 8 because Blackfin
48 multiplies include a shift.
49
50 3) compute into the accumulators cy*yx0, cy*yx1
51
52 4) compute each of the linear equations
53 r = clipz((y-oy)*cy + crv*(v-128))
54
55 g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
56
57 b = clipz((y-oy)*cy + cbu*(u-128))
58
59 reuse of the accumulators requires that we actually multiply
60 twice: once with addition and the second time with a subtraction.
61
62 because of this we need to compute the equations in the order R B
63 then G saving the writes for B in the case of 24/32 bit color
64 formats.
65
66 api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67 int dW, uint32_t *coeffs);
68
69 A B
70 --- ---
71 i2 = cb i3 = cr
72 i1 = coeff i0 = y
73
74 Where coeffs have the following layout in memory.
75
76 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
77
78 coeffs is a pointer to oy.
79
80 the {rgb} masks are only utilized by the 565 and 555 packing algorithms (the
81 24-bit routine loads them but ignores the values). Note that data replication
82 is used to simplify the internal algorithms for the dual MAC architecture of Blackfin.
83
84 All routines are exported with _ff_bfin_ as a symbol prefix
85
86 rough performance gain compared against -O3:
87
88 2779809/1484290 187.28%
89
90 which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
91 c/pel for the optimized implementations. Not sure why there is such a
92 huge variation on the reference codes on Blackfin I guess it must have
93 to do with the memory system.
94
95 */
96
/*
 * Section names handed to DEFUN as its `where` argument.
 * mL1 expands to .l1.text and mL3 to .text; MEM picks the one used by every
 * routine in this file (currently mL1).
 * NOTE(review): .l1.text is presumably the on-chip L1 instruction memory
 * section of the Blackfin linker scripts -- confirm against the build.
 */
97 #define mL1 .l1.text
98 #define mL3 .text
99 #define MEM mL1
100
/*
 * DEFUN(fname, where, interface)
 *
 * Opens an exported routine: switches to section `where`, declares
 * _ff_bfin_<fname> global with type STT_FUNC, aligns, and ends with the
 * bare symbol name so that the ':' written at the use site turns it into
 * the entry label.
 *
 * `interface` does not appear in the expansion; it only carries the C
 * prototype for the reader.
 */
101 #define DEFUN(fname,where,interface) \
102 .section where; \
103 .global _ff_bfin_ ## fname; \
104 .type _ff_bfin_ ## fname, STT_FUNC; \
105 .align 8; \
106 _ff_bfin_ ## fname
107
/*
 * DEFUN_END(fname)
 *
 * Closes a routine opened with DEFUN: sets the size of _ff_bfin_<fname> to
 * the distance between its label and the current location.  fname must be
 * the same name that was passed to DEFUN, otherwise the expression refers
 * to a symbol that was never defined.
 */
108 #define DEFUN_END(fname) \
109 .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
110
111
112 .text
113
/*
 * COEFF_LEN        -- byte length loaded into l1 for the coefficient table
 *                     (11 words of 4 bytes).
 * COEFF_REL_CY_OFF -- byte step loaded into m0; applied as the post-modify
 *                     of the first gmask load in each loop so that the next
 *                     coefficient load is commented as cy again.
 * Both expand to unparenthesized products, so use them only as a complete
 * operand.
 */
114 #define COEFF_LEN 11*4
115 #define COEFF_REL_CY_OFF 4*4
116
/*
 * Offsets from fp (after `link 0`) at which the routines read their
 * 4th, 5th and 6th arguments: out, dW and coeffs.
 */
117 #define ARG_OUT 20
118 #define ARG_W 24
119 #define ARG_COEFF 28
120
/*
 * _ff_bfin_yuv2rgb565_line
 *
 * Converts one line of YUV 4:2:0 samples to packed 16-bit pixels using the
 * shift amounts of the 5-6-5 layout sketched at the top of the loop.
 *
 * Y, U and V arrive in r0, r1 and r2; out, dW and coeffs are read from the
 * stack at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.
 *
 * The hardware loop runs dW>>2 times.  Each pass consumes one 32-bit word
 * of Y (4 samples) and one 16-bit word each of U and V (2 samples each),
 * and stores two 32-bit words through p1, i.e. four 16-bit pixels.  A
 * remainder of dW & 3 pixels is therefore not converted here.
 *
 * The Y/U/V loads for the following pass are issued at the bottom of the
 * loop (and once before it), so the last pass still reads one more group
 * of samples than it converts.
 *
 * i1 walks the coefficient table as a circular buffer (b1 = coeffs,
 * l1 = COEFF_LEN); l1 is cleared again before returning.
 */
121 DEFUN(yuv2rgb565_line,MEM,
122 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
123 link 0; // establish fp so the stack arguments sit at fp+ARG_*
124 [--sp] = (r7:4); // save r7-r4; restored just before unlink
125 p1 = [fp+ARG_OUT]; // p1 = out (store pointer)
126 r3 = [fp+ARG_W]; // r3 = dW
127
128 i0 = r0; // i0 = Y
129 i2 = r1; // i2 = U
130 i3 = r2; // i3 = V
131
132 r0 = [fp+ARG_COEFF];
133 i1 = r0; // i1 = coeffs
134 b1 = i1; // circular buffer base = coeffs
135 l1 = COEFF_LEN; // circular buffer length = whole coefficient table
136 m0 = COEFF_REL_CY_OFF; // post-modify applied after the first gmask load
137 p0 = r3;
138
139 r0 = [i0++]; // 4 Y samples for the first pass (32-bit load)
140 r1.l = w[i2++]; // 2u
141 r1.h = w[i3++]; // 2v
142 p0 = p0>>2; // loop count = dW/4
143
144 lsetup (.L0565, .L1565) lc0 = p0;
145
146 /*
147 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
148 r0 -- used to load 4ys
149 r1 -- used to load 2us,2vs
150 r4 -- y3,y2
151 r5 -- y1,y0
152 r6 -- u1,u0
153 r7 -- v1,v0
154 */
155 r2=[i1++]; // oy
156 .L0565:
157 /*
158 rrrrrrrr gggggggg bbbbbbbb
159 5432109876543210
160 bbbbb >>3
161 gggggggg <<3
162 rrrrrrrr <<8
163 rrrrrggggggbbbbb
164 */
/*
 * NOTE(review): the step commented "R" below applies the >>3 shift and the
 * step commented "B" applies <<8, the reverse of the letters in the sketch
 * above.  Which channel lands where presumably depends on how the caller
 * orders crv/cbu and the masks in coeffs -- confirm against the C side.
 */
165 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
166 (r7,r6) = byteop16m (r1:0, r3:2) (r);
167 r5 = r5 << 2 (v); // y1,y0
168 r4 = r4 << 2 (v); // y3,y2
169 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
170 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
/* First pixel pair: y1,y0 (r5) with the low chroma samples r6.l / r7.l. */
171 /* Y' = y*cy */
172 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
173
174 /* R = Y+ crv*(Cr-128) */
175 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
176 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask; the subtract restores a1/a0 to Y'
177 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
178 r2 = r2 >> 3 (v);
179 r3 = r2 & r5; // r3 starts the packed pixel pair
180
181 /* B = Y+ cbu*(Cb-128) */
182 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
183 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask; the subtract restores a1/a0 to Y'
184 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
185 r2 = r2 << 8 (v);
186 r2 = r2 & r5;
187 r3 = r3 | r2;
188
189 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
190 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
191 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
192 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask; i1 then steps by m0 so the next load is cy
193 r2 = r2 << 3 (v);
194 r2 = r2 & r5;
195 r3 = r3 | r2;
196 [p1++]=r3 || r1=[i1++]; // store first pixel pair; r1 = cy
197
/* Second pixel pair: y3,y2 (r4) with the high chroma samples r6.h / r7.h. */
198 /* Y' = y*cy */
199
200 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
201
202 /* R = Y+ crv*(Cr-128) */
203 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
204 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
205 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
206 r2 = r2 >> 3 (v);
207 r3 = r2 & r5;
208
209 /* B = Y+ cbu*(Cb-128) */
210 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
211 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
212 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
213 r2 = r2 << 8 (v);
214 r2 = r2 & r5;
215 r3 = r3 | r2;
216
217 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
218 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
219 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
220 r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++]; // next 4 Y samples
221 r2 = r2 << 3 (v) || r1.l = w[i2++]; // 2u
222 r2 = r2 & r5;
223 r3 = r3 | r2;
224 [p1++]=r3 || r1.h = w[i3++]; // store second pixel pair; 2v
225 .L1565: r2=[i1++]; // oy (last instruction of the loop)
226
227 l1 = 0; // clear the circular-buffer length set up for i1
228
229 (r7:4) = [sp++];
230 unlink;
231 rts;
232 DEFUN_END(yuv2rgb565_line)
233
/*
 * _ff_bfin_yuv2rgb555_line
 *
 * Converts one line of YUV 4:2:0 samples to packed 16-bit pixels using the
 * shift amounts of the x-5-5-5 layout sketched at the top of the loop
 * (>>3, <<7 and <<2, each followed by an AND with the mask from coeffs).
 *
 * Y, U and V arrive in r0, r1 and r2; out, dW and coeffs are read from the
 * stack at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.
 *
 * The hardware loop runs dW>>2 times.  Each pass consumes one 32-bit word
 * of Y (4 samples) and one 16-bit word each of U and V (2 samples each),
 * and stores two 32-bit words through p1, i.e. four 16-bit pixels.  A
 * remainder of dW & 3 pixels is therefore not converted here.
 *
 * The Y/U/V loads for the following pass are issued at the bottom of the
 * loop (and once before it), so the last pass still reads one more group
 * of samples than it converts.
 *
 * i1 walks the coefficient table as a circular buffer (b1 = coeffs,
 * l1 = COEFF_LEN); l1 is cleared again before returning.
 */
234 DEFUN(yuv2rgb555_line,MEM,
235 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
236 link 0; // establish fp so the stack arguments sit at fp+ARG_*
237 [--sp] = (r7:4); // save r7-r4; restored just before unlink
238 p1 = [fp+ARG_OUT]; // p1 = out (store pointer)
239 r3 = [fp+ARG_W]; // r3 = dW
240
241 i0 = r0; // i0 = Y
242 i2 = r1; // i2 = U
243 i3 = r2; // i3 = V
244
245 r0 = [fp+ARG_COEFF];
246 i1 = r0; // i1 = coeffs
247 b1 = i1; // circular buffer base = coeffs
248 l1 = COEFF_LEN; // circular buffer length = whole coefficient table
249 m0 = COEFF_REL_CY_OFF; // post-modify applied after the first gmask load
250 p0 = r3;
251
252 r0 = [i0++]; // 4 Y samples for the first pass (32-bit load)
253 r1.l = w[i2++]; // 2u
254 r1.h = w[i3++]; // 2v
255 p0 = p0>>2; // loop count = dW/4
256
257 lsetup (.L0555, .L1555) lc0 = p0;
258
259 /*
260 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
261 r0 -- used to load 4ys
262 r1 -- used to load 2us,2vs
263 r4 -- y3,y2
264 r5 -- y1,y0
265 r6 -- u1,u0
266 r7 -- v1,v0
267 */
268 r2=[i1++]; // oy
269 .L0555:
270 /*
271 rrrrrrrr gggggggg bbbbbbbb
272 5432109876543210
273 bbbbb >>3
274 gggggggg <<2
275 rrrrrrrr <<7
276 xrrrrrgggggbbbbb
277 */
278
/*
 * NOTE(review): the step commented "R" below applies the >>3 shift and the
 * step commented "B" applies <<7, the reverse of the letters in the sketch
 * above.  Which channel lands where presumably depends on how the caller
 * orders crv/cbu and the masks in coeffs -- confirm against the C side.
 */
279 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
280 (r7,r6) = byteop16m (r1:0, r3:2) (r);
281 r5 = r5 << 2 (v); // y1,y0
282 r4 = r4 << 2 (v); // y3,y2
283 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
284 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
/* First pixel pair: y1,y0 (r5) with the low chroma samples r6.l / r7.l. */
285 /* Y' = y*cy */
286 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
287
288 /* R = Y+ crv*(Cr-128) */
289 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
290 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask; the subtract restores a1/a0 to Y'
291 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
292 r2 = r2 >> 3 (v);
293 r3 = r2 & r5; // r3 starts the packed pixel pair
294
295 /* B = Y+ cbu*(Cb-128) */
296 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
297 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask; the subtract restores a1/a0 to Y'
298 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
299 r2 = r2 << 7 (v);
300 r2 = r2 & r5;
301 r3 = r3 | r2;
302
303 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
304 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
305 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
306 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask; i1 then steps by m0 so the next load is cy
307 r2 = r2 << 2 (v);
308 r2 = r2 & r5;
309 r3 = r3 | r2;
310 [p1++]=r3 || r1=[i1++]; // store first pixel pair; r1 = cy
311
/* Second pixel pair: y3,y2 (r4) with the high chroma samples r6.h / r7.h. */
312 /* Y' = y*cy */
313
314 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
315
316 /* R = Y+ crv*(Cr-128) */
317 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
318 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
319 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
320 r2 = r2 >> 3 (v);
321 r3 = r2 & r5;
322
323 /* B = Y+ cbu*(Cb-128) */
324 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
325 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
326 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
327 r2 = r2 << 7 (v);
328 r2 = r2 & r5;
329 r3 = r3 | r2;
330
331 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
332 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
333 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
334 r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++]; // next 4 Y samples
335 r2 = r2 << 2 (v) || r1.l=w[i2++]; // 2u
336 r2 = r2 & r5;
337 r3 = r3 | r2;
338 [p1++]=r3 || r1.h=w[i3++]; // store second pixel pair; 2v
339
340 .L1555: r2=[i1++]; // oy (last instruction of the loop)
341
342 l1 = 0; // clear the circular-buffer length set up for i1
343
344 (r7:4) = [sp++];
345 unlink;
346 rts;
347 DEFUN_END(yuv2rgb555_line)
348
/*
 * _ff_bfin_yuv2rgb24_line
 *
 * Converts one line of YUV 4:2:0 samples to 3-byte pixels.
 *
 * Y, U and V arrive in r0, r1 and r2; out, dW and coeffs are read from the
 * stack at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.
 *
 * The hardware loop runs dW>>2 times.  Each pass consumes one 32-bit word
 * of Y (4 samples) and one 16-bit word each of U and V (2 samples each),
 * and writes 12 bytes, i.e. four pixels.  A remainder of dW & 3 pixels is
 * therefore not converted here.
 *
 * Two byte pointers are used for the output: p1 starts at out and p2 at
 * out+3.  Every result register carries the component for two neighbouring
 * pixels; its low byte is stored through p1, then the register is shifted
 * right by 16 and the low byte is stored through p2.  After three
 * components each pointer is advanced by 3 to step over the pixel the
 * other pointer has just written.
 *
 * The mask words (rmask, bmask, gmask) are still loaded into r5 so that i1
 * keeps stepping through the coefficient table, but r5 is not read again
 * after those loads in this routine.
 *
 * The Y/U/V loads for the following pass are issued at the bottom of the
 * loop (and once before it), so the last pass still reads one more group
 * of samples than it converts.
 *
 * i1 walks the coefficient table as a circular buffer (b1 = coeffs,
 * l1 = COEFF_LEN); l1 is cleared again before returning.
 */
349 DEFUN(yuv2rgb24_line,MEM,
350 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
351 link 0; // establish fp so the stack arguments sit at fp+ARG_*
352 [--sp] = (r7:4); // save r7-r4; restored just before unlink
353 p1 = [fp+ARG_OUT]; // p1 = out: byte pointer for the first pixel of each pair
354 r3 = [fp+ARG_W]; // r3 = dW
355 p2 = p1;
356 p2 += 3; // p2 = out+3: byte pointer for the second pixel of each pair
357
358 i0 = r0; // i0 = Y
359 i2 = r1; // i2 = U
360 i3 = r2; // i3 = V
361
362 r0 = [fp+ARG_COEFF]; // coeff buffer
363 i1 = r0; // i1 = coeffs
364 b1 = i1; // circular buffer base = coeffs
365 l1 = COEFF_LEN; // circular buffer length = whole coefficient table
366 m0 = COEFF_REL_CY_OFF; // post-modify applied after the first gmask load
367 p0 = r3;
368
369 r0 = [i0++]; // 4 Y samples for the first pass (32-bit load)
370 r1.l = w[i2++]; // 2u
371 r1.h = w[i3++]; // 2v
372 p0 = p0>>2; // loop count = dW/4
373
374 lsetup (.L0888, .L1888) lc0 = p0;
375
376 /*
377 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
378 r0 -- used to load 4ys
379 r1 -- used to load 2us,2vs
380 r4 -- y3,y2
381 r5 -- y1,y0
382 r6 -- u1,u0
383 r7 -- v1,v0
384 */
385 r2=[i1++]; // oy
386 .L0888:
387 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
388 (r7,r6) = byteop16m (r1:0, r3:2) (r);
389 r5 = r5 << 2 (v); // y1,y0
390 r4 = r4 << 2 (v); // y3,y2
391 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
392 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
393
/* First pixel pair: y1,y0 (r5) with the low chroma samples r6.l / r7.l. */
394 /* Y' = y*cy */
395 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
396
397 /* R = Y+ crv*(Cr-128) */
398 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
399 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask (value unused); the subtract restores a1/a0 to Y'
400 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
401 r2=r2>>16 || B[p1++]=r2; // store low byte via p1, then bring the other pixel's byte down
402 B[p2++]=r2;
403
404 /* B = Y+ cbu*(Cb-128) */
405 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
406 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask (value unused); the subtract restores a1/a0 to Y'
407 r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu; B is parked in r3 and written after G
408
409 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
410 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
411 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
412 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask (value unused); i1 then steps by m0 so the next load is cy
413
414 r2=r2>>16 || B[p1++]=r2; // G bytes
415 B[p2++]=r2;
416
417 r3=r3>>16 || B[p1++]=r3; // B bytes
418 B[p2++]=r3 || r1=[i1++]; // cy
419
420 p1+=3; // skip the pixel just written through p2
421 p2+=3; // skip the next pixel, which p1 will write
/* Second pixel pair: y3,y2 (r4) with the high chroma samples r6.h / r7.h. */
422 /* Y' = y*cy */
423 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
424
425 /* R = Y+ crv*(Cr-128) */
426 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
427 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask (value unused)
428 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
429 r2=r2>>16 || B[p1++]=r2;
430 B[p2++]=r2;
431
432 /* B = Y+ cbu*(Cb-128) */
433 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
434 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask (value unused)
435 r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
436
437 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
438 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
439 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
440 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++]; // gmask (value unused)
441 r2=r2>>16 || B[p1++]=r2 || r0 = [i0++]; // next 4 Y samples
442 B[p2++]=r2 || r1.l = w[i2++]; // 2u
443 r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
444 B[p2++]=r3 || r2=[i1++]; // oy
445
446 p1+=3;
447 .L1888: p2+=3; // last instruction of the loop
448
449 l1 = 0; // clear the circular-buffer length set up for i1
450
451 (r7:4) = [sp++];
452 unlink;
453 rts;
/* The name must match the DEFUN above so that .size refers to the symbol
   that was actually defined. */
454 DEFUN_END(yuv2rgb24_line)