comparison libswscale/rgb2rgb_template.c @ 22991:59671a52cc82

New implementation of rgb32tobgr32. The previous implementation segfaulted with MMX enabled when fed an image smaller than the size of the units the MMX code processed. The new code: (1) is faster for MMX, MMX2 and plain C; (2) processes small images correctly; (3) is LGPL.
author ivo
date Mon, 16 Apr 2007 21:41:03 +0000
parents 2d1ad4285df4
children ac77d9ef8c83
comparison
equal deleted inserted replaced
22990:a76748e71a18 22991:59671a52cc82
1362 } 1362 }
1363 } 1363 }
1364 1364
1365 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size) 1365 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1366 { 1366 {
1367 #ifdef HAVE_MMX 1367 uint8_t *d = dst, *s = (uint8_t *) src;
1368 /* TODO: unroll this loop */ 1368 const uint8_t *end = s + src_size;
1369 asm volatile ( 1369 #ifdef HAVE_MMX
1370 "xor %%"REG_a", %%"REG_a" \n\t" 1370 __asm __volatile(
1371 ASMALIGN(4) 1371 " "PREFETCH" (%1) \n"
1372 "1: \n\t" 1372 " movq %3, %%mm7 \n"
1373 PREFETCH" 32(%0, %%"REG_a") \n\t" 1373 " pxor %4, %%mm7 \n"
1374 "movq (%0, %%"REG_a"), %%mm0 \n\t" 1374 " movq %%mm7, %%mm6 \n"
1375 "movq %%mm0, %%mm1 \n\t" 1375 " pxor %5, %%mm7 \n"
1376 "movq %%mm0, %%mm2 \n\t" 1376 " jmp 2f \n"
1377 "pslld $16, %%mm0 \n\t" 1377 ASMALIGN(4)
1378 "psrld $16, %%mm1 \n\t" 1378 "1: \n"
1379 "pand "MANGLE(mask32r)", %%mm0 \n\t" 1379 " "PREFETCH" 32(%1) \n"
1380 "pand "MANGLE(mask32g)", %%mm2 \n\t" 1380 " movq (%1), %%mm0 \n"
1381 "pand "MANGLE(mask32b)", %%mm1 \n\t" 1381 " movq 8(%1), %%mm1 \n"
1382 "por %%mm0, %%mm2 \n\t" 1382 # ifdef HAVE_MMX2
1383 "por %%mm1, %%mm2 \n\t" 1383 " pshufw $177, %%mm0, %%mm3 \n"
1384 MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t" 1384 " pshufw $177, %%mm1, %%mm5 \n"
1385 "add $8, %%"REG_a" \n\t" 1385 " pand %%mm7, %%mm0 \n"
1386 "cmp %2, %%"REG_a" \n\t" 1386 " pand %%mm6, %%mm3 \n"
1387 " jb 1b \n\t" 1387 " pand %%mm7, %%mm1 \n"
1388 :: "r" (src), "r"(dst), "r" (src_size-7) 1388 " pand %%mm6, %%mm5 \n"
1389 : "%"REG_a 1389 " por %%mm3, %%mm0 \n"
1390 ); 1390 " por %%mm5, %%mm1 \n"
1391 1391 # else
1392 __asm __volatile(SFENCE:::"memory"); 1392 " movq %%mm0, %%mm2 \n"
1393 __asm __volatile(EMMS:::"memory"); 1393 " movq %%mm1, %%mm4 \n"
1394 #else 1394 " pand %%mm7, %%mm0 \n"
1395 unsigned i; 1395 " pand %%mm6, %%mm2 \n"
1396 unsigned num_pixels = src_size >> 2; 1396 " pand %%mm7, %%mm1 \n"
1397 for(i=0; i<num_pixels; i++) 1397 " pand %%mm6, %%mm4 \n"
1398 { 1398 " movq %%mm2, %%mm3 \n"
1399 #ifdef WORDS_BIGENDIAN 1399 " movq %%mm4, %%mm5 \n"
1400 dst[4*i + 1] = src[4*i + 3]; 1400 " pslld $16, %%mm2 \n"
1401 dst[4*i + 2] = src[4*i + 2]; 1401 " psrld $16, %%mm3 \n"
1402 dst[4*i + 3] = src[4*i + 1]; 1402 " pslld $16, %%mm4 \n"
1403 #else 1403 " psrld $16, %%mm5 \n"
1404 dst[4*i + 0] = src[4*i + 2]; 1404 " por %%mm2, %%mm0 \n"
1405 dst[4*i + 1] = src[4*i + 1]; 1405 " por %%mm4, %%mm1 \n"
1406 dst[4*i + 2] = src[4*i + 0]; 1406 " por %%mm3, %%mm0 \n"
1407 #endif 1407 " por %%mm5, %%mm1 \n"
1408 } 1408 # endif
1409 #endif 1409 " "MOVNTQ" %%mm0, (%0) \n"
1410 " "MOVNTQ" %%mm1, 8(%0) \n"
1411 " add $16, %0 \n"
1412 " add $16, %1 \n"
1413 "2: \n"
1414 " cmp %1, %2 \n"
1415 " ja 1b \n"
1416 " "SFENCE" \n"
1417 " "EMMS" \n"
1418 : "+r"(d), "+r"(s)
1419 : "r" (end-15), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1420 : "memory");
1421 #endif
1422 for (; s<end; s+=4, d+=4) {
1423 int v = *(uint32_t *)s, g = v & 0xff00;
1424 v &= 0xff00ff;
1425 *(uint32_t *)d = (v>>16) + g + (v<<16);
1426 }
1410 } 1427 }
1411 1428
1412 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size) 1429 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1413 { 1430 {
1414 unsigned i; 1431 unsigned i;