comparison libswscale/x86/yuv2rgb_template2.c @ 31123:b3c85aa7adbf

40% faster YUV420-to-RGB24 MMX conversion. It is now faster than the old GPL version on Conroe.
author lorenm
date Wed, 19 May 2010 08:21:56 +0000
parents 6502a6b24f9b
children b11e3ae960ac
comparison
equal deleted inserted replaced
31122:453095f84e98 31123:b3c85aa7adbf
122 "paddsw %%mm1, %%mm5\n\t" \ 122 "paddsw %%mm1, %%mm5\n\t" \
123 "paddsw %%mm2, %%mm7\n\t" \ 123 "paddsw %%mm2, %%mm7\n\t" \
124 "paddsw %%mm6, %%mm0\n\t" \ 124 "paddsw %%mm6, %%mm0\n\t" \
125 "paddsw %%mm6, %%mm1\n\t" \ 125 "paddsw %%mm6, %%mm1\n\t" \
126 "paddsw %%mm6, %%mm2\n\t" \ 126 "paddsw %%mm6, %%mm2\n\t" \
127 \ 127
128 #define RGB_PACK_INTERLEAVE \
128 /* pack and interleave even/odd pixels */ \ 129 /* pack and interleave even/odd pixels */ \
129 "packuswb %%mm0, %%mm0\n\t" \ 130 "packuswb %%mm1, %%mm0\n\t" \
130 "packuswb %%mm1, %%mm1\n\t" \ 131 "packuswb %%mm5, %%mm3\n\t" \
131 "packuswb %%mm2, %%mm2\n\t" \ 132 "packuswb %%mm2, %%mm2\n\t" \
132 "packuswb %%mm3, %%mm3\n\t" \ 133 "movq %%mm0, %%mm1\n\n" \
133 "packuswb %%mm5, %%mm5\n\t" \
134 "packuswb %%mm7, %%mm7\n\t" \ 134 "packuswb %%mm7, %%mm7\n\t" \
135 "punpcklbw %%mm3, %%mm0\n\t" \ 135 "punpcklbw %%mm3, %%mm0\n\t" \
136 "punpcklbw %%mm5, %%mm1\n\t" \ 136 "punpckhbw %%mm3, %%mm1\n\t" \
137 "punpcklbw %%mm7, %%mm2\n\t" \ 137 "punpcklbw %%mm7, %%mm2\n\t" \
138 138
139 #define YUV2RGB_ENDLOOP(depth) \ 139 #define YUV2RGB_ENDLOOP(depth) \
140 "movq 8 (%5, %0, 2), %%mm6\n\t" \ 140 "movq 8 (%5, %0, 2), %%mm6\n\t" \
141 "movd 4 (%3, %0), %%mm1\n\t" \ 141 "movd 4 (%3, %0), %%mm1\n\t" \
208 c->redDither = ff_dither8[(y + 1) & 1]; 208 c->redDither = ff_dither8[(y + 1) & 1];
209 #endif 209 #endif
210 210
211 YUV2RGB_INITIAL_LOAD 211 YUV2RGB_INITIAL_LOAD
212 YUV2RGB 212 YUV2RGB
213 RGB_PACK_INTERLEAVE
213 #ifdef DITHER1XBPP 214 #ifdef DITHER1XBPP
214 DITHER_RGB 215 DITHER_RGB
215 #endif 216 #endif
216 RGB_PACK16(mmx_redmask, "2", "7") 217 RGB_PACK16(mmx_redmask, "2", "7")
217 218
235 c->redDither = ff_dither8[(y + 1) & 1]; 236 c->redDither = ff_dither8[(y + 1) & 1];
236 #endif 237 #endif
237 238
238 YUV2RGB_INITIAL_LOAD 239 YUV2RGB_INITIAL_LOAD
239 YUV2RGB 240 YUV2RGB
241 RGB_PACK_INTERLEAVE
240 #ifdef DITHER1XBPP 242 #ifdef DITHER1XBPP
241 DITHER_RGB 243 DITHER_RGB
242 #endif 244 #endif
243 RGB_PACK16(mmx_grnmask, "3", "8") 245 RGB_PACK16(mmx_grnmask, "3", "8")
244 246
245 YUV2RGB_ENDLOOP(2) 247 YUV2RGB_ENDLOOP(2)
246 YUV2RGB_OPERANDS 248 YUV2RGB_OPERANDS
247 YUV2RGB_ENDFUNC 249 YUV2RGB_ENDFUNC
248 } 250 }
249 251
250 252 #define RGB_PACK24(blue, red)\
251 #define RGB_PACK24(red, blue) \ 253 "packuswb %%mm3, %%mm0 \n" /* R0 R2 R4 R6 R1 R3 R5 R7 */\
252 /* generate first packed RGB octet */ \ 254 "packuswb %%mm5, %%mm1 \n" /* B0 B2 B4 B6 B1 B3 B5 B7 */\
253 "movq %%mm2, %%mm5\n\t" \ 255 "packuswb %%mm7, %%mm2 \n" /* G0 G2 G4 G6 G1 G3 G5 G7 */\
254 "movq %%mm"blue", %%mm6\n\t" \ 256 "movq %%mm"red", %%mm3 \n"\
255 "movq %%mm"red", %%mm7\n\t" \ 257 "movq %%mm"blue", %%mm6 \n"\
256 "punpcklbw %%mm5, %%mm6\n\t" \ 258 "psrlq $32, %%mm"red" \n" /* R1 R3 R5 R7 */\
257 "punpcklbw %%mm4, %%mm7\n\t" \ 259 "punpcklbw %%mm2, %%mm3 \n" /* R0 G0 R2 G2 R4 G4 R6 G6 */\
258 "movq %%mm6, %%mm3\n\t" \ 260 "punpcklbw %%mm"red", %%mm6 \n" /* B0 R1 B2 R3 B4 R5 B6 R7 */\
259 "punpcklwd %%mm7, %%mm6\n\t" \ 261 "movq %%mm3, %%mm5 \n"\
260 "psrlq $32, %%mm3\n\t" \ 262 "punpckhbw %%mm"blue", %%mm2 \n" /* G1 B1 G3 B3 G5 B5 G7 B7 */\
261 "movq %%mm6, %%mm5\n\t" \ 263 "punpcklwd %%mm6, %%mm3 \n" /* R0 G0 B0 R1 R2 G2 B2 R3 */\
262 "psllq $40, %%mm6\n\t" \ 264 "punpckhwd %%mm6, %%mm5 \n" /* R4 G4 B4 R5 R6 G6 B6 R7 */\
263 "psllq $48, %%mm3\n\t" \ 265 RGB_PACK24_B
264 "psrlq $32, %%mm5\n\t" \ 266
265 "psrlq $40, %%mm6\n\t" \ 267 #if HAVE_MMX2
266 "psllq $24, %%mm5\n\t" \ 268 DECLARE_ASM_CONST(8, int16_t, mask1101[4]) = {-1,-1, 0,-1};
267 "por %%mm3, %%mm6\n\t" \ 269 DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0};
268 "por %%mm5, %%mm6\n\t" \ 270 DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0};
269 MOVNTQ " %%mm6, (%1)\n\t" \ 271 DECLARE_ASM_CONST(8, int16_t, mask1001[4]) = {-1, 0, 0,-1};
270 \ 272 DECLARE_ASM_CONST(8, int16_t, mask0100[4]) = { 0,-1, 0, 0};
271 /* generate second packed RGB octet */ \ 273 #undef RGB_PACK24_B
272 "movq %%mm"red", %%mm7\n\t" \ 274 #define RGB_PACK24_B\
273 "movq %%mm2, %%mm5\n\t" \ 275 "pshufw $0xc6, %%mm2, %%mm1 \n"\
274 "movq %%mm"blue", %%mm6\n\t" \ 276 "pshufw $0x84, %%mm3, %%mm6 \n"\
275 "punpcklbw %%mm4, %%mm7\n\t" \ 277 "pshufw $0x38, %%mm5, %%mm7 \n"\
276 "punpcklbw %%mm5, %%mm6\n\t" \ 278 "pand "MANGLE(mask1101)", %%mm6 \n" /* R0 G0 B0 R1 -- -- R2 G2 */\
277 "movq %%mm7, %%mm3\n\t" \ 279 "movq %%mm1, %%mm0 \n"\
278 "punpckhwd %%mm7, %%mm6\n\t" \ 280 "pand "MANGLE(mask0110)", %%mm7 \n" /* -- -- R6 G6 B6 R7 -- -- */\
279 "psllq $16, %%mm3\n\t" \ 281 "movq %%mm1, %%mm2 \n"\
280 "psrlq $32, %%mm6\n\t" \ 282 "pand "MANGLE(mask0100)", %%mm1 \n" /* -- -- G3 B3 -- -- -- -- */\
281 "psrlq $48, %%mm3\n\t" \ 283 "psrlq $48, %%mm3 \n" /* B2 R3 -- -- -- -- -- -- */\
282 "psllq $8, %%mm6\n\t" \ 284 "pand "MANGLE(mask0010)", %%mm0 \n" /* -- -- -- -- G1 B1 -- -- */\
283 "movq %%mm"red", %%mm7\n\t" \ 285 "psllq $32, %%mm5 \n" /* -- -- -- -- R4 G4 B4 R5 */\
284 "por %%mm6, %%mm3\n\t" \ 286 "pand "MANGLE(mask1001)", %%mm2 \n" /* G5 B5 -- -- -- -- G7 B7 */\
285 "movq %%mm"blue", %%mm6\n\t" \ 287 "por %%mm3, %%mm1 \n"\
286 "movq %%mm2, %%mm5\n\t" \ 288 "por %%mm6, %%mm0 \n"\
287 "punpckhbw %%mm4, %%mm7\n\t" \ 289 "por %%mm5, %%mm1 \n"\
288 "punpckhbw %%mm5, %%mm6\n\t" \ 290 "por %%mm7, %%mm2 \n"\
289 "movq %%mm6, %%mm5\n\t" \ 291 MOVNTQ" %%mm0, (%1) \n"\
290 "punpcklwd %%mm7, %%mm6\n\t" \ 292 MOVNTQ" %%mm1, 8(%1) \n"\
291 "psrlq $16, %%mm5\n\t" \ 293 MOVNTQ" %%mm2, 16(%1) \n"\
292 "psllq $56, %%mm5\n\t" \ 294
293 "por %%mm5, %%mm3\n\t" \ 295 #else
294 "psllq $32, %%mm6\n\t" \ 296 #undef RGB_PACK24_B
295 "por %%mm6, %%mm3\n\t" \ 297 #define RGB_PACK24_B\
296 MOVNTQ " %%mm3, 8(%1)\n\t" \ 298 "movd %%mm3, (%1) \n" /* R0 G0 B0 R1 */\
297 \ 299 "movd %%mm2, 4(%1) \n" /* G1 B1 */\
298 /* generate third packed RGB octet */ \ 300 "psrlq $32, %%mm3 \n"\
299 "movq %%mm"red", %%mm7\n\t" \ 301 "psrlq $16, %%mm2 \n"\
300 "movq %%mm2, %%mm5\n\t" \ 302 "movd %%mm3, 6(%1) \n" /* R2 G2 B2 R3 */\
301 "movq %%mm2, %%mm3\n\t" \ 303 "movd %%mm2, 10(%1) \n" /* G3 B3 */\
302 "movq %%mm"blue", %%mm6\n\t" \ 304 "psrlq $16, %%mm2 \n"\
303 "punpckhbw %%mm"red", %%mm3\n\t" \ 305 "movd %%mm5, 12(%1) \n" /* R4 G4 B4 R5 */\
304 "punpckhbw %%mm4, %%mm7\n\t" \ 306 "movd %%mm2, 16(%1) \n" /* G5 B5 */\
305 "psllq $32, %%mm3\n\t" \ 307 "psrlq $32, %%mm5 \n"\
306 "punpckhbw %%mm5, %%mm6\n\t" \ 308 "movd %%mm2, 20(%1) \n" /* -- -- G7 B7 */\
307 "psrlq $48, %%mm3\n\t" \ 309 "movd %%mm5, 18(%1) \n" /* R6 G6 B6 R7 */\
308 "punpckhwd %%mm7, %%mm6\n\t" \ 310
309 "movq %%mm6, %%mm7\n\t" \ 311 #endif
310 "psrlq $32, %%mm6\n\t" \
311 "psllq $32, %%mm7\n\t" \
312 "psllq $40, %%mm6\n\t" \
313 "psrlq $16, %%mm7\n\t" \
314 "por %%mm6, %%mm3\n\t" \
315 "por %%mm7, %%mm3\n\t" \
316 MOVNTQ " %%mm3, 16(%1)\n\t" \
317 312
318 static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[], 313 static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
319 int srcStride[], 314 int srcStride[],
320 int srcSliceY, int srcSliceH, 315 int srcSliceY, int srcSliceH,
321 uint8_t *dst[], int dstStride[]) 316 uint8_t *dst[], int dstStride[])
385 380
386 YUV2RGB_LOOP(4) 381 YUV2RGB_LOOP(4)
387 382
388 YUV2RGB_INITIAL_LOAD 383 YUV2RGB_INITIAL_LOAD
389 YUV2RGB 384 YUV2RGB
385 RGB_PACK_INTERLEAVE
390 SET_EMPTY_ALPHA 386 SET_EMPTY_ALPHA
391 RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA) 387 RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
392 388
393 YUV2RGB_ENDLOOP(4) 389 YUV2RGB_ENDLOOP(4)
394 YUV2RGB_OPERANDS 390 YUV2RGB_OPERANDS
406 YUV2RGB_LOOP(4) 402 YUV2RGB_LOOP(4)
407 403
408 const uint8_t *pa = src[3] + y * srcStride[3]; 404 const uint8_t *pa = src[3] + y * srcStride[3];
409 YUV2RGB_INITIAL_LOAD 405 YUV2RGB_INITIAL_LOAD
410 YUV2RGB 406 YUV2RGB
407 RGB_PACK_INTERLEAVE
411 LOAD_ALPHA 408 LOAD_ALPHA
412 RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA) 409 RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
413 410
414 YUV2RGB_ENDLOOP(4) 411 YUV2RGB_ENDLOOP(4)
415 YUV2RGB_OPERANDS_ALPHA 412 YUV2RGB_OPERANDS_ALPHA
426 423
427 YUV2RGB_LOOP(4) 424 YUV2RGB_LOOP(4)
428 425
429 YUV2RGB_INITIAL_LOAD 426 YUV2RGB_INITIAL_LOAD
430 YUV2RGB 427 YUV2RGB
428 RGB_PACK_INTERLEAVE
431 SET_EMPTY_ALPHA 429 SET_EMPTY_ALPHA
432 RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA) 430 RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
433 431
434 YUV2RGB_ENDLOOP(4) 432 YUV2RGB_ENDLOOP(4)
435 YUV2RGB_OPERANDS 433 YUV2RGB_OPERANDS
447 YUV2RGB_LOOP(4) 445 YUV2RGB_LOOP(4)
448 446
449 const uint8_t *pa = src[3] + y * srcStride[3]; 447 const uint8_t *pa = src[3] + y * srcStride[3];
450 YUV2RGB_INITIAL_LOAD 448 YUV2RGB_INITIAL_LOAD
451 YUV2RGB 449 YUV2RGB
450 RGB_PACK_INTERLEAVE
452 LOAD_ALPHA 451 LOAD_ALPHA
453 RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA) 452 RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
454 453
455 YUV2RGB_ENDLOOP(4) 454 YUV2RGB_ENDLOOP(4)
456 YUV2RGB_OPERANDS_ALPHA 455 YUV2RGB_OPERANDS_ALPHA