Mercurial > mplayer.hg
comparison libswscale/internal_bfin.S @ 23289:786823779b67
Blackfin optimized YUV420 to RGB CSC Color Space Converters.
YUV2 -> RGB BGR for 565, 555 and 888 a.k.a. 24bit color.
Speed-up compared to C version compiled with -O3 187.28%
Patch by Marc Hoffman %mmh A pleasantst P com%
Original thread:
Date: May 9, 2007 2:46 AM
Subject: [FFmpeg-devel] PATCH BlackFin yuv2rgb color space conversion
author | gpoirier |
---|---|
date | Sun, 13 May 2007 19:22:32 +0000 |
parents | |
children | d7af89e69e92 |
comparison
equal
deleted
inserted
replaced
23288:3e0d632ec805 | 23289:786823779b67 |
---|---|
1 /* | |
2 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com> | |
3 * April 20, 2007 | |
4 * | |
5 * Blackfin Video Color Space Converters Operations | |
6 * convert I420 YV12 to RGB in various formats, | |
7 * | |
8 * This file is part of FFmpeg. | |
9 * | |
10 * FFmpeg is free software; you can redistribute it and/or | |
11 * modify it under the terms of the GNU Lesser General Public | |
12 * License as published by the Free Software Foundation; either | |
13 * version 2.1 of the License, or (at your option) any later version. | |
14 * | |
15 * FFmpeg is distributed in the hope that it will be useful, | |
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
18 * Lesser General Public License for more details. | |
19 * | |
20 * You should have received a copy of the GNU Lesser General Public | |
21 * License along with FFmpeg; if not, write to the Free Software | |
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
23 */ | |
24 | |
25 | |
26 /* | |
27 YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock | |
28 and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts | |
29 | |
30 | |
31 The following calculation is used for the conversion: | |
32 | |
33 r = clipz((y-oy)*cy + crv*(v-128)) | |
34 g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128)) | |
35 b = clipz((y-oy)*cy + cbu*(u-128)) | |
36 | |
37 y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision. | |
38 | |
39 | |
40 New factorization to eliminate the truncation error which was | |
41 occurring due to the byteop3p. | |
42 | |
43 | |
44 1) use the byteop16m to subtract quad bytes; we use this in U8, | |
45 so the offsets need to be renormalized to 8 bits. | |
46 | |
47 2) scale operands up by a factor of 4 not 8 because Blackfin | |
48 multiplies include a shift. | |
49 | |
50 3) compute into the accumulators cy*yx0, cy*yx1 | |
51 | |
52 4) compute each of the linear equations | |
53 r = clipz((y-oy)*cy + crv*(v-128)) | |
54 | |
55 g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128)) | |
56 | |
57 b = clipz((y-oy)*cy + cbu*(u-128)) | |
58 | |
59 reuse of the accumulators requires that we actually multiply | |
60 twice, once with addition and the second time with a subtraction. | |
61 | |
62 because of this we need to compute the equations in the order R B | |
63 then G saving the writes for B in the case of 24/32 bit color | |
64 formats. | |
65 | |
66 api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, | |
67 int dW, uint32_t *coeffs); | |
68 | |
69 A B | |
70 --- --- | |
71 i2 = cb i3 = cr | |
72 i1 = coeff i0 = y | |
73 | |
74 Where coeffs have the following layout in memory. | |
75 | |
76 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask; | |
77 | |
78 coeffs is a pointer to oy. | |
79 | |
80 the {rgb} masks are only utilized by the 565 and 555 packing algorithms. Note the data | |
81 replication is used to simplify the internal algorithms for the dual mac architecture | |
82 of BlackFin. | |
83 | |
84 All routines are exported with _ff_bfin_ as a symbol prefix | |
85 | |
86 rough performance gain compared against -O3: | |
87 | |
88 2779809/1484290 187.28% | |
89 | |
90 which translates to ~33c/pel to ~57c/pel for the reference vs 17.5 | |
91 c/pel for the optimized implementations. Not sure why there is such a | |
92 huge variation on the reference codes on Blackfin I guess it must have | |
93 to do with the memory system. | |
94 | |
95 */ | |
96 /* Section selection: mL1 names the .l1.text section and mL3 the plain .text section; MEM is the one passed to DEFUN by the routines below (currently mL1). */ | |
97 #define mL1 .l1.text | |
98 #define mL3 .text | |
99 #define MEM mL1 | |
100 /* DEFUN(fname,where,interface): switch to section `where`, declare _ff_bfin_<fname> as a global function symbol, align, and emit the symbol name (the use site appends the ':' that turns it into a label). `interface` is documentation only; it does not appear in the expansion. */ | |
101 #define DEFUN(fname,where,interface) \ | |
102 .section where; \ | |
103 .global _ff_bfin_ ## fname; \ | |
104 .type _ff_bfin_ ## fname, STT_FUNC; \ | |
105 .align 8; \ | |
106 _ff_bfin_ ## fname | |
107 /* DEFUN_END(fname): set the size of symbol _ff_bfin_<fname> to the distance from its label to the current location; fname must match the DEFUN it closes. */ | |
108 #define DEFUN_END(fname) \ | |
109 .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname | |
110 | |
111 | |
112 .text | |
113 /* Coefficient table geometry: COEFF_LEN is the table size in bytes (11 32-bit words) and is loaded into l1 as the circular-buffer length for i1; COEFF_REL_CY_OFF is the byte step loaded into m0 so that [i1++m0] on gmask wraps i1 around to cy. */ | |
114 #define COEFF_LEN 11*4 | |
115 #define COEFF_REL_CY_OFF 4*4 | |
116 /* Frame-pointer-relative byte offsets (after `link 0`) from which the routines load the out, dW and coeffs arguments. */ | |
117 #define ARG_OUT 20 | |
118 #define ARG_W 24 | |
119 #define ARG_COEFF 28 | |
120 | |
121 DEFUN(yuv2rgb565_line,MEM, | |
122 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)): | |
123 link 0; // frame setup; Y, U, V arrive in r0, r1, r2 | |
124 [--sp] = (r7:4); // save r7-r4 (restored before rts) | |
125 p1 = [fp+ARG_OUT]; // p1 = out | |
126 r3 = [fp+ARG_W]; // r3 = dW | |
127 /* Converts one line to 16-bit 5:6:5 pixels: each loop pass consumes 4 Y, 2 U and 2 V bytes and stores two 32-bit words through p1, each holding two packed 16-bit pixels. */ | |
128 i0 = r0; // i0 -> Y | |
129 i2 = r1; // i2 -> U | |
130 i3 = r2; // i3 -> V | |
131 | |
132 r0 = [fp+ARG_COEFF]; | |
133 i1 = r0; // i1 -> coeffs | |
134 b1 = i1; | |
135 l1 = COEFF_LEN; // b1/l1: i1 walks the coefficient table as a circular buffer | |
136 m0 = COEFF_REL_CY_OFF; // step used by [i1++m0] to get from gmask back to cy | |
137 p0 = r3; | |
138 | |
139 r0 = [i0++]; // 4Y (preload for the first pass) | |
140 r1.l = w[i2++]; // 2u | |
141 r1.h = w[i3++]; // 2v | |
142 p0 = p0>>2; // loop count = dW/4 (4 pixels per pass) | |
143 | |
144 lsetup (.L0565, .L1565) lc0 = p0; | |
145 | |
146 /* | |
147 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask | |
148 r0 -- used to load 4ys | |
149 r1 -- used to load 2us,2vs | |
150 r4 -- y3,y2 | |
151 r5 -- y1,y0 | |
152 r6 -- u1,u0 | |
153 r7 -- v1,v0 | |
154 */ | |
155 r2=[i1++]; // oy | |
156 .L0565: | |
157 /* | |
158 rrrrrrrr gggggggg bbbbbbbb | |
159 5432109876543210 | |
160 bbbbb >>3 | |
161 gggggggg <<3 | |
162 rrrrrrrr <<8 | |
163 rrrrrggggggbbbbb | |
164 */ | |
165 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc | |
166 (r7,r6) = byteop16m (r1:0, r3:2) (r); | |
167 r5 = r5 << 2 (v); // y1,y0 | |
168 r4 = r4 << 2 (v); // y3,y2 | |
169 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero | |
170 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy | |
171 /* Y' = y*cy */ | |
172 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv | |
173 /* NOTE(review): the component computed first lands in the low 5 bits (>>3) and the next one in the top 5 bits (<<8), the reverse of the r/b positions drawn above; presumably the caller's coefficient and mask order decides which channel is which -- confirm. */ | |
174 /* R = Y+ crv*(Cr-128) */ | |
175 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); | |
176 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask (the -= leaves Y' in a1/a0 again) | |
177 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu | |
178 r2 = r2 >> 3 (v); | |
179 r3 = r2 & r5; | |
180 | |
181 /* B = Y+ cbu*(Cb-128) */ | |
182 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); | |
183 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask (the -= leaves Y' in a1/a0 again) | |
184 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu | |
185 r2 = r2 << 8 (v); | |
186 r2 = r2 & r5; | |
187 r3 = r3 | r2; | |
188 | |
189 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ | |
190 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv | |
191 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); | |
192 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask; i1 += m0 wraps around to cy | |
193 r2 = r2 << 3 (v); | |
194 r2 = r2 & r5; | |
195 r3 = r3 | r2; | |
196 [p1++]=r3 || r1=[i1++]; // store pixels 0,1; r1=cy | |
197 | |
198 /* Y' = y*cy */ | |
199 | |
200 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv | |
201 | |
202 /* R = Y+ crv*(Cr-128) */ | |
203 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); | |
204 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask | |
205 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu | |
206 r2 = r2 >> 3 (v); | |
207 r3 = r2 & r5; | |
208 | |
209 /* B = Y+ cbu*(Cb-128) */ | |
210 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); | |
211 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask | |
212 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu | |
213 r2 = r2 << 8 (v); | |
214 r2 = r2 & r5; | |
215 r3 = r3 | r2; | |
216 | |
217 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ | |
218 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv | |
219 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask | |
220 r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++]; // next 4Y | |
221 r2 = r2 << 3 (v) || r1.l = w[i2++]; // next 2u | |
222 r2 = r2 & r5; | |
223 r3 = r3 | r2; | |
224 [p1++]=r3 || r1.h = w[i3++]; // store pixels 2,3; next 2v | |
225 .L1565: r2=[i1++]; // oy | |
226 /* The loop tail always preloads the next 4 Y / 2 U / 2 V bytes, so the final pass reads that many bytes beyond the data it converts. */ | |
227 l1 = 0; // back to linear addressing for i1 | |
228 | |
229 (r7:4) = [sp++]; | |
230 unlink; | |
231 rts; | |
232 DEFUN_END(yuv2rgb565_line) | |
233 | |
234 DEFUN(yuv2rgb555_line,MEM, | |
235 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)): | |
236 link 0; // frame setup; Y, U, V arrive in r0, r1, r2 | |
237 [--sp] = (r7:4); // save r7-r4 (restored before rts) | |
238 p1 = [fp+ARG_OUT]; // p1 = out | |
239 r3 = [fp+ARG_W]; // r3 = dW | |
240 /* Converts one line to 16-bit 5:5:5 pixels: each loop pass consumes 4 Y, 2 U and 2 V bytes and stores two 32-bit words through p1, each holding two packed 16-bit pixels. */ | |
241 i0 = r0; // i0 -> Y | |
242 i2 = r1; // i2 -> U | |
243 i3 = r2; // i3 -> V | |
244 | |
245 r0 = [fp+ARG_COEFF]; | |
246 i1 = r0; // i1 -> coeffs | |
247 b1 = i1; | |
248 l1 = COEFF_LEN; // b1/l1: i1 walks the coefficient table as a circular buffer | |
249 m0 = COEFF_REL_CY_OFF; // step used by [i1++m0] to get from gmask back to cy | |
250 p0 = r3; | |
251 | |
252 r0 = [i0++]; // 4Y (preload for the first pass) | |
253 r1.l = w[i2++]; // 2u | |
254 r1.h = w[i3++]; // 2v | |
255 p0 = p0>>2; // loop count = dW/4 (4 pixels per pass) | |
256 | |
257 lsetup (.L0555, .L1555) lc0 = p0; | |
258 | |
259 /* | |
260 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask | |
261 r0 -- used to load 4ys | |
262 r1 -- used to load 2us,2vs | |
263 r4 -- y3,y2 | |
264 r5 -- y1,y0 | |
265 r6 -- u1,u0 | |
266 r7 -- v1,v0 | |
267 */ | |
268 r2=[i1++]; // oy | |
269 .L0555: | |
270 /* | |
271 rrrrrrrr gggggggg bbbbbbbb | |
272 5432109876543210 | |
273 bbbbb >>3 | |
274 gggggggg <<2 | |
275 rrrrrrrr <<7 | |
276 xrrrrrgggggbbbbb | |
277 */ | |
278 /* NOTE(review): the component computed first lands in the low 5 bits (>>3) and the next one in bits 10-14 (<<7), the reverse of the r/b positions drawn above; presumably the caller's coefficient and mask order decides which channel is which -- confirm. */ | |
279 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc | |
280 (r7,r6) = byteop16m (r1:0, r3:2) (r); | |
281 r5 = r5 << 2 (v); // y1,y0 | |
282 r4 = r4 << 2 (v); // y3,y2 | |
283 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero | |
284 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy | |
285 /* Y' = y*cy */ | |
286 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv | |
287 | |
288 /* R = Y+ crv*(Cr-128) */ | |
289 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); | |
290 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask (the -= leaves Y' in a1/a0 again) | |
291 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu | |
292 r2 = r2 >> 3 (v); | |
293 r3 = r2 & r5; | |
294 | |
295 /* B = Y+ cbu*(Cb-128) */ | |
296 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); | |
297 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask (the -= leaves Y' in a1/a0 again) | |
298 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu | |
299 r2 = r2 << 7 (v); | |
300 r2 = r2 & r5; | |
301 r3 = r3 | r2; | |
302 | |
303 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ | |
304 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv | |
305 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); | |
306 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask; i1 += m0 wraps around to cy | |
307 r2 = r2 << 2 (v); | |
308 r2 = r2 & r5; | |
309 r3 = r3 | r2; | |
310 [p1++]=r3 || r1=[i1++]; // store pixels 0,1; r1=cy | |
311 | |
312 /* Y' = y*cy */ | |
313 | |
314 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv | |
315 | |
316 /* R = Y+ crv*(Cr-128) */ | |
317 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); | |
318 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask | |
319 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu | |
320 r2 = r2 >> 3 (v); | |
321 r3 = r2 & r5; | |
322 | |
323 /* B = Y+ cbu*(Cb-128) */ | |
324 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); | |
325 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask | |
326 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu | |
327 r2 = r2 << 7 (v); | |
328 r2 = r2 & r5; | |
329 r3 = r3 | r2; | |
330 | |
331 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ | |
332 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv | |
333 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask | |
334 r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++]; // next 4Y | |
335 r2 = r2 << 2 (v) || r1.l=w[i2++]; // next 2u | |
336 r2 = r2 & r5; | |
337 r3 = r3 | r2; | |
338 [p1++]=r3 || r1.h=w[i3++]; // store pixels 2,3; next 2v | |
339 /* The loop tail always preloads the next 4 Y / 2 U / 2 V bytes, so the final pass reads that many bytes beyond the data it converts. */ | |
340 .L1555: r2=[i1++]; // oy | |
341 | |
342 l1 = 0; // back to linear addressing for i1 | |
343 | |
344 (r7:4) = [sp++]; | |
345 unlink; | |
346 rts; | |
347 DEFUN_END(yuv2rgb555_line) | |
348 | |
349 DEFUN(yuv2rgb24_line,MEM, | |
350 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)): | |
351 link 0; // frame setup; Y, U, V arrive in r0, r1, r2 | |
352 [--sp] = (r7:4); // save r7-r4 (restored before rts) | |
353 p1 = [fp+ARG_OUT]; // p1 = out (even pixel of each pair) | |
354 r3 = [fp+ARG_W]; // r3 = dW | |
355 p2 = p1; | |
356 p2 += 3; // p2 = out + 3 (odd pixel of each pair) | |
357 /* Converts one line to 3-byte pixels: each loop pass consumes 4 Y, 2 U and 2 V bytes and stores 12 bytes (4 pixels) with byte stores through p1 and p2. */ | |
358 i0 = r0; // i0 -> Y | |
359 i2 = r1; // i2 -> U | |
360 i3 = r2; // i3 -> V | |
361 | |
362 r0 = [fp+ARG_COEFF]; // coeff buffer | |
363 i1 = r0; | |
364 b1 = i1; | |
365 l1 = COEFF_LEN; // b1/l1: i1 walks the coefficient table as a circular buffer | |
366 m0 = COEFF_REL_CY_OFF; // step used by [i1++m0] to get from gmask back to cy | |
367 p0 = r3; | |
368 | |
369 r0 = [i0++]; // 4Y (preload for the first pass) | |
370 r1.l = w[i2++]; // 2u | |
371 r1.h = w[i3++]; // 2v | |
372 p0 = p0>>2; // loop count = dW/4 (4 pixels per pass) | |
373 | |
374 lsetup (.L0888, .L1888) lc0 = p0; | |
375 | |
376 /* | |
377 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask | |
378 r0 -- used to load 4ys | |
379 r1 -- used to load 2us,2vs | |
380 r4 -- y3,y2 | |
381 r5 -- y1,y0 | |
382 r6 -- u1,u0 | |
383 r7 -- v1,v0 | |
384 */ | |
385 r2=[i1++]; // oy | |
386 .L0888: | |
387 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc | |
388 (r7,r6) = byteop16m (r1:0, r3:2) (r); | |
389 r5 = r5 << 2 (v); // y1,y0 | |
390 r4 = r4 << 2 (v); // y3,y2 | |
391 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero | |
392 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy | |
393 | |
394 /* Y' = y*cy */ | |
395 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv | |
396 /* Each byteop3p result carries one component for two pixels: its low byte is stored through p1 and, after >>16, the other through p2 -- presumably the parallel byte store sees the register before the shift; confirm against the ISA reference. The mask words are still loaded (to keep i1 in step) but are not applied here. */ | |
397 /* R = Y+ crv*(Cr-128) */ | |
398 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); | |
399 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask (the -= leaves Y' in a1/a0 again) | |
400 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu | |
401 r2=r2>>16 || B[p1++]=r2; | |
402 B[p2++]=r2; | |
403 | |
404 /* B = Y+ cbu*(Cb-128) */ | |
405 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); | |
406 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask (the -= leaves Y' in a1/a0 again) | |
407 r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu; result kept in r3 until G has been written | |
408 | |
409 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ | |
410 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv | |
411 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); | |
412 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask, oy,cy,zero; i1 += m0 wraps around to cy | |
413 | |
414 r2=r2>>16 || B[p1++]=r2; | |
415 B[p2++]=r2; | |
416 | |
417 r3=r3>>16 || B[p1++]=r3; | |
418 B[p2++]=r3 || r1=[i1++]; // cy | |
419 | |
420 p1+=3; // skip the 3 bytes just written through p2 | |
421 p2+=3; // skip the 3 bytes about to be written through p1 | |
422 /* Y' = y*cy */ | |
423 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv | |
424 | |
425 /* R = Y+ crv*(Cr-128) */ | |
426 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); | |
427 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask | |
428 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu | |
429 r2=r2>>16 || B[p1++]=r2; | |
430 B[p2++]=r2; | |
431 | |
432 /* B = Y+ cbu*(Cb-128) */ | |
433 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); | |
434 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask | |
435 r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu | |
436 | |
437 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ | |
438 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv | |
439 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); | |
440 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++]; // gmask | |
441 r2=r2>>16 || B[p1++]=r2 || r0 = [i0++]; // next 4y | |
442 B[p2++]=r2 || r1.l = w[i2++]; // next 2u | |
443 r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // next 2v | |
444 B[p2++]=r3 || r2=[i1++]; // oy | |
445 /* The loop tail always preloads the next 4 Y / 2 U / 2 V bytes, so the final pass reads that many bytes beyond the data it converts. */ | |
446 p1+=3; | |
447 .L1888: p2+=3; | |
448 | |
449 l1 = 0; // back to linear addressing for i1 | |
450 | |
451 (r7:4) = [sp++]; | |
452 unlink; | |
453 rts; | |
454 DEFUN_END(yuv2rgb24_line) | |