Mercurial > mplayer.hg
comparison libswscale/x86/yuv2rgb_template2.c @ 31123:b3c85aa7adbf
40% faster YUV420-to-RGB24 conversion in the MMX code path.
It is now faster than the old GPL version on Conroe.
author | lorenm |
---|---|
date | Wed, 19 May 2010 08:21:56 +0000 |
parents | 6502a6b24f9b |
children | b11e3ae960ac |
comparison
equal
deleted
inserted
replaced
31122:453095f84e98 | 31123:b3c85aa7adbf |
---|---|
122 "paddsw %%mm1, %%mm5\n\t" \ | 122 "paddsw %%mm1, %%mm5\n\t" \ |
123 "paddsw %%mm2, %%mm7\n\t" \ | 123 "paddsw %%mm2, %%mm7\n\t" \ |
124 "paddsw %%mm6, %%mm0\n\t" \ | 124 "paddsw %%mm6, %%mm0\n\t" \ |
125 "paddsw %%mm6, %%mm1\n\t" \ | 125 "paddsw %%mm6, %%mm1\n\t" \ |
126 "paddsw %%mm6, %%mm2\n\t" \ | 126 "paddsw %%mm6, %%mm2\n\t" \ |
127 \ | 127 |
128 #define RGB_PACK_INTERLEAVE \ | |
128 /* pack and interleave even/odd pixels */ \ | 129 /* pack and interleave even/odd pixels */ \ |
129 "packuswb %%mm0, %%mm0\n\t" \ | 130 "packuswb %%mm1, %%mm0\n\t" \ |
130 "packuswb %%mm1, %%mm1\n\t" \ | 131 "packuswb %%mm5, %%mm3\n\t" \ |
131 "packuswb %%mm2, %%mm2\n\t" \ | 132 "packuswb %%mm2, %%mm2\n\t" \ |
132 "packuswb %%mm3, %%mm3\n\t" \ | 133 "movq %%mm0, %%mm1\n\n" \ |
133 "packuswb %%mm5, %%mm5\n\t" \ | |
134 "packuswb %%mm7, %%mm7\n\t" \ | 134 "packuswb %%mm7, %%mm7\n\t" \ |
135 "punpcklbw %%mm3, %%mm0\n\t" \ | 135 "punpcklbw %%mm3, %%mm0\n\t" \ |
136 "punpcklbw %%mm5, %%mm1\n\t" \ | 136 "punpckhbw %%mm3, %%mm1\n\t" \ |
137 "punpcklbw %%mm7, %%mm2\n\t" \ | 137 "punpcklbw %%mm7, %%mm2\n\t" \ |
138 | 138 |
139 #define YUV2RGB_ENDLOOP(depth) \ | 139 #define YUV2RGB_ENDLOOP(depth) \ |
140 "movq 8 (%5, %0, 2), %%mm6\n\t" \ | 140 "movq 8 (%5, %0, 2), %%mm6\n\t" \ |
141 "movd 4 (%3, %0), %%mm1\n\t" \ | 141 "movd 4 (%3, %0), %%mm1\n\t" \ |
208 c->redDither = ff_dither8[(y + 1) & 1]; | 208 c->redDither = ff_dither8[(y + 1) & 1]; |
209 #endif | 209 #endif |
210 | 210 |
211 YUV2RGB_INITIAL_LOAD | 211 YUV2RGB_INITIAL_LOAD |
212 YUV2RGB | 212 YUV2RGB |
213 RGB_PACK_INTERLEAVE | |
213 #ifdef DITHER1XBPP | 214 #ifdef DITHER1XBPP |
214 DITHER_RGB | 215 DITHER_RGB |
215 #endif | 216 #endif |
216 RGB_PACK16(mmx_redmask, "2", "7") | 217 RGB_PACK16(mmx_redmask, "2", "7") |
217 | 218 |
235 c->redDither = ff_dither8[(y + 1) & 1]; | 236 c->redDither = ff_dither8[(y + 1) & 1]; |
236 #endif | 237 #endif |
237 | 238 |
238 YUV2RGB_INITIAL_LOAD | 239 YUV2RGB_INITIAL_LOAD |
239 YUV2RGB | 240 YUV2RGB |
241 RGB_PACK_INTERLEAVE | |
240 #ifdef DITHER1XBPP | 242 #ifdef DITHER1XBPP |
241 DITHER_RGB | 243 DITHER_RGB |
242 #endif | 244 #endif |
243 RGB_PACK16(mmx_grnmask, "3", "8") | 245 RGB_PACK16(mmx_grnmask, "3", "8") |
244 | 246 |
245 YUV2RGB_ENDLOOP(2) | 247 YUV2RGB_ENDLOOP(2) |
246 YUV2RGB_OPERANDS | 248 YUV2RGB_OPERANDS |
247 YUV2RGB_ENDFUNC | 249 YUV2RGB_ENDFUNC |
248 } | 250 } |
249 | 251 |
250 | 252 #define RGB_PACK24(blue, red)\ |
251 #define RGB_PACK24(red, blue) \ | 253 "packuswb %%mm3, %%mm0 \n" /* R0 R2 R4 R6 R1 R3 R5 R7 */\ |
252 /* generate first packed RGB octet */ \ | 254 "packuswb %%mm5, %%mm1 \n" /* B0 B2 B4 B6 B1 B3 B5 B7 */\ |
253 "movq %%mm2, %%mm5\n\t" \ | 255 "packuswb %%mm7, %%mm2 \n" /* G0 G2 G4 G6 G1 G3 G5 G7 */\ |
254 "movq %%mm"blue", %%mm6\n\t" \ | 256 "movq %%mm"red", %%mm3 \n"\ |
255 "movq %%mm"red", %%mm7\n\t" \ | 257 "movq %%mm"blue", %%mm6 \n"\ |
256 "punpcklbw %%mm5, %%mm6\n\t" \ | 258 "psrlq $32, %%mm"red" \n" /* R1 R3 R5 R7 */\ |
257 "punpcklbw %%mm4, %%mm7\n\t" \ | 259 "punpcklbw %%mm2, %%mm3 \n" /* R0 G0 R2 G2 R4 G4 R6 G6 */\ |
258 "movq %%mm6, %%mm3\n\t" \ | 260 "punpcklbw %%mm"red", %%mm6 \n" /* B0 R1 B2 R3 B4 R5 B6 R7 */\ |
259 "punpcklwd %%mm7, %%mm6\n\t" \ | 261 "movq %%mm3, %%mm5 \n"\ |
260 "psrlq $32, %%mm3\n\t" \ | 262 "punpckhbw %%mm"blue", %%mm2 \n" /* G1 B1 G3 B3 G5 B5 G7 B7 */\ |
261 "movq %%mm6, %%mm5\n\t" \ | 263 "punpcklwd %%mm6, %%mm3 \n" /* R0 G0 B0 R1 R2 G2 B2 R3 */\ |
262 "psllq $40, %%mm6\n\t" \ | 264 "punpckhwd %%mm6, %%mm5 \n" /* R4 G4 B4 R5 R6 G6 B6 R7 */\ |
263 "psllq $48, %%mm3\n\t" \ | 265 RGB_PACK24_B |
264 "psrlq $32, %%mm5\n\t" \ | 266 |
265 "psrlq $40, %%mm6\n\t" \ | 267 #if HAVE_MMX2 |
266 "psllq $24, %%mm5\n\t" \ | 268 DECLARE_ASM_CONST(8, int16_t, mask1101[4]) = {-1,-1, 0,-1}; |
267 "por %%mm3, %%mm6\n\t" \ | 269 DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0}; |
268 "por %%mm5, %%mm6\n\t" \ | 270 DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0}; |
269 MOVNTQ " %%mm6, (%1)\n\t" \ | 271 DECLARE_ASM_CONST(8, int16_t, mask1001[4]) = {-1, 0, 0,-1}; |
270 \ | 272 DECLARE_ASM_CONST(8, int16_t, mask0100[4]) = { 0,-1, 0, 0}; |
271 /* generate second packed RGB octet */ \ | 273 #undef RGB_PACK24_B |
272 "movq %%mm"red", %%mm7\n\t" \ | 274 #define RGB_PACK24_B\ |
273 "movq %%mm2, %%mm5\n\t" \ | 275 "pshufw $0xc6, %%mm2, %%mm1 \n"\ |
274 "movq %%mm"blue", %%mm6\n\t" \ | 276 "pshufw $0x84, %%mm3, %%mm6 \n"\ |
275 "punpcklbw %%mm4, %%mm7\n\t" \ | 277 "pshufw $0x38, %%mm5, %%mm7 \n"\ |
276 "punpcklbw %%mm5, %%mm6\n\t" \ | 278 "pand "MANGLE(mask1101)", %%mm6 \n" /* R0 G0 B0 R1 -- -- R2 G2 */\ |
277 "movq %%mm7, %%mm3\n\t" \ | 279 "movq %%mm1, %%mm0 \n"\ |
278 "punpckhwd %%mm7, %%mm6\n\t" \ | 280 "pand "MANGLE(mask0110)", %%mm7 \n" /* -- -- R6 G6 B6 R7 -- -- */\ |
279 "psllq $16, %%mm3\n\t" \ | 281 "movq %%mm1, %%mm2 \n"\ |
280 "psrlq $32, %%mm6\n\t" \ | 282 "pand "MANGLE(mask0100)", %%mm1 \n" /* -- -- G3 B3 -- -- -- -- */\ |
281 "psrlq $48, %%mm3\n\t" \ | 283 "psrlq $48, %%mm3 \n" /* B2 R3 -- -- -- -- -- -- */\ |
282 "psllq $8, %%mm6\n\t" \ | 284 "pand "MANGLE(mask0010)", %%mm0 \n" /* -- -- -- -- G1 B1 -- -- */\ |
283 "movq %%mm"red", %%mm7\n\t" \ | 285 "psllq $32, %%mm5 \n" /* -- -- -- -- R4 G4 B4 R5 */\ |
284 "por %%mm6, %%mm3\n\t" \ | 286 "pand "MANGLE(mask1001)", %%mm2 \n" /* G5 B5 -- -- -- -- G7 B7 */\ |
285 "movq %%mm"blue", %%mm6\n\t" \ | 287 "por %%mm3, %%mm1 \n"\ |
286 "movq %%mm2, %%mm5\n\t" \ | 288 "por %%mm6, %%mm0 \n"\ |
287 "punpckhbw %%mm4, %%mm7\n\t" \ | 289 "por %%mm5, %%mm1 \n"\ |
288 "punpckhbw %%mm5, %%mm6\n\t" \ | 290 "por %%mm7, %%mm2 \n"\ |
289 "movq %%mm6, %%mm5\n\t" \ | 291 MOVNTQ" %%mm0, (%1) \n"\ |
290 "punpcklwd %%mm7, %%mm6\n\t" \ | 292 MOVNTQ" %%mm1, 8(%1) \n"\ |
291 "psrlq $16, %%mm5\n\t" \ | 293 MOVNTQ" %%mm2, 16(%1) \n"\ |
292 "psllq $56, %%mm5\n\t" \ | 294 |
293 "por %%mm5, %%mm3\n\t" \ | 295 #else |
294 "psllq $32, %%mm6\n\t" \ | 296 #undef RGB_PACK24_B |
295 "por %%mm6, %%mm3\n\t" \ | 297 #define RGB_PACK24_B\ |
296 MOVNTQ " %%mm3, 8(%1)\n\t" \ | 298 "movd %%mm3, (%1) \n" /* R0 G0 B0 R1 */\ |
297 \ | 299 "movd %%mm2, 4(%1) \n" /* G1 B1 */\ |
298 /* generate third packed RGB octet */ \ | 300 "psrlq $32, %%mm3 \n"\ |
299 "movq %%mm"red", %%mm7\n\t" \ | 301 "psrlq $16, %%mm2 \n"\ |
300 "movq %%mm2, %%mm5\n\t" \ | 302 "movd %%mm3, 6(%1) \n" /* R2 G2 B2 R3 */\ |
301 "movq %%mm2, %%mm3\n\t" \ | 303 "movd %%mm2, 10(%1) \n" /* G3 B3 */\ |
302 "movq %%mm"blue", %%mm6\n\t" \ | 304 "psrlq $16, %%mm2 \n"\ |
303 "punpckhbw %%mm"red", %%mm3\n\t" \ | 305 "movd %%mm5, 12(%1) \n" /* R4 G4 B4 R5 */\ |
304 "punpckhbw %%mm4, %%mm7\n\t" \ | 306 "movd %%mm2, 16(%1) \n" /* G5 B5 */\ |
305 "psllq $32, %%mm3\n\t" \ | 307 "psrlq $32, %%mm5 \n"\ |
306 "punpckhbw %%mm5, %%mm6\n\t" \ | 308 "movd %%mm2, 20(%1) \n" /* -- -- G7 B7 */\ |
307 "psrlq $48, %%mm3\n\t" \ | 309 "movd %%mm5, 18(%1) \n" /* R6 G6 B6 R7 */\ |
308 "punpckhwd %%mm7, %%mm6\n\t" \ | 310 |
309 "movq %%mm6, %%mm7\n\t" \ | 311 #endif |
310 "psrlq $32, %%mm6\n\t" \ | |
311 "psllq $32, %%mm7\n\t" \ | |
312 "psllq $40, %%mm6\n\t" \ | |
313 "psrlq $16, %%mm7\n\t" \ | |
314 "por %%mm6, %%mm3\n\t" \ | |
315 "por %%mm7, %%mm3\n\t" \ | |
316 MOVNTQ " %%mm3, 16(%1)\n\t" \ | |
317 | 312 |
318 static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[], | 313 static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[], |
319 int srcStride[], | 314 int srcStride[], |
320 int srcSliceY, int srcSliceH, | 315 int srcSliceY, int srcSliceH, |
321 uint8_t *dst[], int dstStride[]) | 316 uint8_t *dst[], int dstStride[]) |
385 | 380 |
386 YUV2RGB_LOOP(4) | 381 YUV2RGB_LOOP(4) |
387 | 382 |
388 YUV2RGB_INITIAL_LOAD | 383 YUV2RGB_INITIAL_LOAD |
389 YUV2RGB | 384 YUV2RGB |
385 RGB_PACK_INTERLEAVE | |
390 SET_EMPTY_ALPHA | 386 SET_EMPTY_ALPHA |
391 RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA) | 387 RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA) |
392 | 388 |
393 YUV2RGB_ENDLOOP(4) | 389 YUV2RGB_ENDLOOP(4) |
394 YUV2RGB_OPERANDS | 390 YUV2RGB_OPERANDS |
406 YUV2RGB_LOOP(4) | 402 YUV2RGB_LOOP(4) |
407 | 403 |
408 const uint8_t *pa = src[3] + y * srcStride[3]; | 404 const uint8_t *pa = src[3] + y * srcStride[3]; |
409 YUV2RGB_INITIAL_LOAD | 405 YUV2RGB_INITIAL_LOAD |
410 YUV2RGB | 406 YUV2RGB |
407 RGB_PACK_INTERLEAVE | |
411 LOAD_ALPHA | 408 LOAD_ALPHA |
412 RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA) | 409 RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA) |
413 | 410 |
414 YUV2RGB_ENDLOOP(4) | 411 YUV2RGB_ENDLOOP(4) |
415 YUV2RGB_OPERANDS_ALPHA | 412 YUV2RGB_OPERANDS_ALPHA |
426 | 423 |
427 YUV2RGB_LOOP(4) | 424 YUV2RGB_LOOP(4) |
428 | 425 |
429 YUV2RGB_INITIAL_LOAD | 426 YUV2RGB_INITIAL_LOAD |
430 YUV2RGB | 427 YUV2RGB |
428 RGB_PACK_INTERLEAVE | |
431 SET_EMPTY_ALPHA | 429 SET_EMPTY_ALPHA |
432 RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA) | 430 RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA) |
433 | 431 |
434 YUV2RGB_ENDLOOP(4) | 432 YUV2RGB_ENDLOOP(4) |
435 YUV2RGB_OPERANDS | 433 YUV2RGB_OPERANDS |
447 YUV2RGB_LOOP(4) | 445 YUV2RGB_LOOP(4) |
448 | 446 |
449 const uint8_t *pa = src[3] + y * srcStride[3]; | 447 const uint8_t *pa = src[3] + y * srcStride[3]; |
450 YUV2RGB_INITIAL_LOAD | 448 YUV2RGB_INITIAL_LOAD |
451 YUV2RGB | 449 YUV2RGB |
450 RGB_PACK_INTERLEAVE | |
452 LOAD_ALPHA | 451 LOAD_ALPHA |
453 RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA) | 452 RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA) |
454 | 453 |
455 YUV2RGB_ENDLOOP(4) | 454 YUV2RGB_ENDLOOP(4) |
456 YUV2RGB_OPERANDS_ALPHA | 455 YUV2RGB_OPERANDS_ALPHA |