Mercurial > mplayer.hg
comparison libswscale/rgb2rgb_template.c @ 22991:59671a52cc82
New implementation of rgb32tobgr32
The previous implementation segfaulted with MMX enabled when fed an image
smaller than the size of the units the MMX code processed. The new code:
- is faster for MMX, MMX2 and plain C
- processes small images correctly
- is LGPL
author | ivo |
---|---|
date | Mon, 16 Apr 2007 21:41:03 +0000 |
parents | 2d1ad4285df4 |
children | ac77d9ef8c83 |
comparison
equal
deleted
inserted
replaced
22990:a76748e71a18 | 22991:59671a52cc82 |
---|---|
1362 } | 1362 } |
1363 } | 1363 } |
1364 | 1364 |
1365 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size) | 1365 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size) |
1366 { | 1366 { |
1367 #ifdef HAVE_MMX | 1367 uint8_t *d = dst, *s = (uint8_t *) src; |
1368 /* TODO: unroll this loop */ | 1368 const uint8_t *end = s + src_size; |
1369 asm volatile ( | 1369 #ifdef HAVE_MMX |
1370 "xor %%"REG_a", %%"REG_a" \n\t" | 1370 __asm __volatile( |
1371 ASMALIGN(4) | 1371 " "PREFETCH" (%1) \n" |
1372 "1: \n\t" | 1372 " movq %3, %%mm7 \n" |
1373 PREFETCH" 32(%0, %%"REG_a") \n\t" | 1373 " pxor %4, %%mm7 \n" |
1374 "movq (%0, %%"REG_a"), %%mm0 \n\t" | 1374 " movq %%mm7, %%mm6 \n" |
1375 "movq %%mm0, %%mm1 \n\t" | 1375 " pxor %5, %%mm7 \n" |
1376 "movq %%mm0, %%mm2 \n\t" | 1376 " jmp 2f \n" |
1377 "pslld $16, %%mm0 \n\t" | 1377 ASMALIGN(4) |
1378 "psrld $16, %%mm1 \n\t" | 1378 "1: \n" |
1379 "pand "MANGLE(mask32r)", %%mm0 \n\t" | 1379 " "PREFETCH" 32(%1) \n" |
1380 "pand "MANGLE(mask32g)", %%mm2 \n\t" | 1380 " movq (%1), %%mm0 \n" |
1381 "pand "MANGLE(mask32b)", %%mm1 \n\t" | 1381 " movq 8(%1), %%mm1 \n" |
1382 "por %%mm0, %%mm2 \n\t" | 1382 # ifdef HAVE_MMX2 |
1383 "por %%mm1, %%mm2 \n\t" | 1383 " pshufw $177, %%mm0, %%mm3 \n" |
1384 MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t" | 1384 " pshufw $177, %%mm1, %%mm5 \n" |
1385 "add $8, %%"REG_a" \n\t" | 1385 " pand %%mm7, %%mm0 \n" |
1386 "cmp %2, %%"REG_a" \n\t" | 1386 " pand %%mm6, %%mm3 \n" |
1387 " jb 1b \n\t" | 1387 " pand %%mm7, %%mm1 \n" |
1388 :: "r" (src), "r"(dst), "r" (src_size-7) | 1388 " pand %%mm6, %%mm5 \n" |
1389 : "%"REG_a | 1389 " por %%mm3, %%mm0 \n" |
1390 ); | 1390 " por %%mm5, %%mm1 \n" |
1391 | 1391 # else |
1392 __asm __volatile(SFENCE:::"memory"); | 1392 " movq %%mm0, %%mm2 \n" |
1393 __asm __volatile(EMMS:::"memory"); | 1393 " movq %%mm1, %%mm4 \n" |
1394 #else | 1394 " pand %%mm7, %%mm0 \n" |
1395 unsigned i; | 1395 " pand %%mm6, %%mm2 \n" |
1396 unsigned num_pixels = src_size >> 2; | 1396 " pand %%mm7, %%mm1 \n" |
1397 for(i=0; i<num_pixels; i++) | 1397 " pand %%mm6, %%mm4 \n" |
1398 { | 1398 " movq %%mm2, %%mm3 \n" |
1399 #ifdef WORDS_BIGENDIAN | 1399 " movq %%mm4, %%mm5 \n" |
1400 dst[4*i + 1] = src[4*i + 3]; | 1400 " pslld $16, %%mm2 \n" |
1401 dst[4*i + 2] = src[4*i + 2]; | 1401 " psrld $16, %%mm3 \n" |
1402 dst[4*i + 3] = src[4*i + 1]; | 1402 " pslld $16, %%mm4 \n" |
1403 #else | 1403 " psrld $16, %%mm5 \n" |
1404 dst[4*i + 0] = src[4*i + 2]; | 1404 " por %%mm2, %%mm0 \n" |
1405 dst[4*i + 1] = src[4*i + 1]; | 1405 " por %%mm4, %%mm1 \n" |
1406 dst[4*i + 2] = src[4*i + 0]; | 1406 " por %%mm3, %%mm0 \n" |
1407 #endif | 1407 " por %%mm5, %%mm1 \n" |
1408 } | 1408 # endif |
1409 #endif | 1409 " "MOVNTQ" %%mm0, (%0) \n" |
| | 1410 " "MOVNTQ" %%mm1, 8(%0) \n" |
| | 1411 " add $16, %0 \n" |
| | 1412 " add $16, %1 \n" |
| | 1413 "2: \n" |
| | 1414 " cmp %1, %2 \n" |
| | 1415 " ja 1b \n" |
| | 1416 " "SFENCE" \n" |
| | 1417 " "EMMS" \n" |
| | 1418 : "+r"(d), "+r"(s) |
| | 1419 : "r" (end-15), "m" (mask32b), "m" (mask32r), "m" (mmx_one) |
| | 1420 : "memory"); |
| | 1421 #endif |
| | 1422 for (; s<end; s+=4, d+=4) { |
| | 1423 int v = *(uint32_t *)s, g = v & 0xff00; |
| | 1424 v &= 0xff00ff; |
| | 1425 *(uint32_t *)d = (v>>16) + g + (v<<16); |
| | 1426 } |
1410 } | 1427 } |
1411 | 1428 |
1412 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size) | 1429 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size) |
1413 { | 1430 { |
1414 unsigned i; | 1431 unsigned i; |