comparison postproc/rgb2rgb_template.c @ 13720:821f464b4d90

adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
author aurel
date Thu, 21 Oct 2004 11:55:20 +0000
parents 7f5ea5da1765
children 49dd10a86b23
comparison
equal deleted inserted replaced
13719:43ecd6a73ec0 13720:821f464b4d90
347 "por %%mm4, %%mm3 \n\t" 347 "por %%mm4, %%mm3 \n\t"
348 "psrld $5, %%mm0 \n\t" 348 "psrld $5, %%mm0 \n\t"
349 "pslld $11, %%mm3 \n\t" 349 "pslld $11, %%mm3 \n\t"
350 "por %%mm3, %%mm0 \n\t" 350 "por %%mm3, %%mm0 \n\t"
351 MOVNTQ" %%mm0, (%0) \n\t" 351 MOVNTQ" %%mm0, (%0) \n\t"
352 "addl $16, %1 \n\t" 352 "add $16, %1 \n\t"
353 "addl $8, %0 \n\t" 353 "add $8, %0 \n\t"
354 "cmpl %2, %1 \n\t" 354 "cmp %2, %1 \n\t"
355 " jb 1b \n\t" 355 " jb 1b \n\t"
356 : "+r" (d), "+r"(s) 356 : "+r" (d), "+r"(s)
357 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) 357 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
358 ); 358 );
359 #else 359 #else
507 "por %%mm4, %%mm3 \n\t" 507 "por %%mm4, %%mm3 \n\t"
508 "psrld $6, %%mm0 \n\t" 508 "psrld $6, %%mm0 \n\t"
509 "pslld $10, %%mm3 \n\t" 509 "pslld $10, %%mm3 \n\t"
510 "por %%mm3, %%mm0 \n\t" 510 "por %%mm3, %%mm0 \n\t"
511 MOVNTQ" %%mm0, (%0) \n\t" 511 MOVNTQ" %%mm0, (%0) \n\t"
512 "addl $16, %1 \n\t" 512 "add $16, %1 \n\t"
513 "addl $8, %0 \n\t" 513 "add $8, %0 \n\t"
514 "cmpl %2, %1 \n\t" 514 "cmp %2, %1 \n\t"
515 " jb 1b \n\t" 515 " jb 1b \n\t"
516 : "+r" (d), "+r"(s) 516 : "+r" (d), "+r"(s)
517 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) 517 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
518 ); 518 );
519 #else 519 #else
1343 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size) 1343 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1344 { 1344 {
1345 #ifdef HAVE_MMX 1345 #ifdef HAVE_MMX
1346 /* TODO: unroll this loop */ 1346 /* TODO: unroll this loop */
1347 asm volatile ( 1347 asm volatile (
1348 "xorl %%eax, %%eax \n\t" 1348 "xor %%"REG_a", %%"REG_a" \n\t"
1349 ".balign 16 \n\t" 1349 ".balign 16 \n\t"
1350 "1: \n\t" 1350 "1: \n\t"
1351 PREFETCH" 32(%0, %%eax) \n\t" 1351 PREFETCH" 32(%0, %%"REG_a") \n\t"
1352 "movq (%0, %%eax), %%mm0 \n\t" 1352 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1353 "movq %%mm0, %%mm1 \n\t" 1353 "movq %%mm0, %%mm1 \n\t"
1354 "movq %%mm0, %%mm2 \n\t" 1354 "movq %%mm0, %%mm2 \n\t"
1355 "pslld $16, %%mm0 \n\t" 1355 "pslld $16, %%mm0 \n\t"
1356 "psrld $16, %%mm1 \n\t" 1356 "psrld $16, %%mm1 \n\t"
1357 "pand "MANGLE(mask32r)", %%mm0 \n\t" 1357 "pand "MANGLE(mask32r)", %%mm0 \n\t"
1358 "pand "MANGLE(mask32g)", %%mm2 \n\t" 1358 "pand "MANGLE(mask32g)", %%mm2 \n\t"
1359 "pand "MANGLE(mask32b)", %%mm1 \n\t" 1359 "pand "MANGLE(mask32b)", %%mm1 \n\t"
1360 "por %%mm0, %%mm2 \n\t" 1360 "por %%mm0, %%mm2 \n\t"
1361 "por %%mm1, %%mm2 \n\t" 1361 "por %%mm1, %%mm2 \n\t"
1362 MOVNTQ" %%mm2, (%1, %%eax) \n\t" 1362 MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
1363 "addl $8, %%eax \n\t" 1363 "add $8, %%"REG_a" \n\t"
1364 "cmpl %2, %%eax \n\t" 1364 "cmp %2, %%"REG_a" \n\t"
1365 " jb 1b \n\t" 1365 " jb 1b \n\t"
1366 :: "r" (src), "r"(dst), "r" (src_size-7) 1366 :: "r" (src), "r"(dst), "r" ((long)src_size-7)
1367 : "%eax" 1367 : "%"REG_a
1368 ); 1368 );
1369 1369
1370 __asm __volatile(SFENCE:::"memory"); 1370 __asm __volatile(SFENCE:::"memory");
1371 __asm __volatile(EMMS:::"memory"); 1371 __asm __volatile(EMMS:::"memory");
1372 #else 1372 #else
1389 1389
1390 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size) 1390 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1391 { 1391 {
1392 unsigned i; 1392 unsigned i;
1393 #ifdef HAVE_MMX 1393 #ifdef HAVE_MMX
1394 int mmx_size= 23 - src_size; 1394 long mmx_size= 23 - src_size;
1395 asm volatile ( 1395 asm volatile (
1396 "movq "MANGLE(mask24r)", %%mm5 \n\t" 1396 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1397 "movq "MANGLE(mask24g)", %%mm6 \n\t" 1397 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1398 "movq "MANGLE(mask24b)", %%mm7 \n\t" 1398 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1399 ".balign 16 \n\t" 1399 ".balign 16 \n\t"
1400 "1: \n\t" 1400 "1: \n\t"
1401 PREFETCH" 32(%1, %%eax) \n\t" 1401 PREFETCH" 32(%1, %%"REG_a") \n\t"
1402 "movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG 1402 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1403 "movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG 1403 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1404 "movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B 1404 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1405 "psllq $16, %%mm0 \n\t" // 00 BGR BGR 1405 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1406 "pand %%mm5, %%mm0 \n\t" 1406 "pand %%mm5, %%mm0 \n\t"
1407 "pand %%mm6, %%mm1 \n\t" 1407 "pand %%mm6, %%mm1 \n\t"
1408 "pand %%mm7, %%mm2 \n\t" 1408 "pand %%mm7, %%mm2 \n\t"
1409 "por %%mm0, %%mm1 \n\t" 1409 "por %%mm0, %%mm1 \n\t"
1410 "por %%mm2, %%mm1 \n\t" 1410 "por %%mm2, %%mm1 \n\t"
1411 "movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG 1411 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1412 MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG 1412 MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG
1413 "movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B 1413 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1414 "movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR 1414 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1415 "pand %%mm7, %%mm0 \n\t" 1415 "pand %%mm7, %%mm0 \n\t"
1416 "pand %%mm5, %%mm1 \n\t" 1416 "pand %%mm5, %%mm1 \n\t"
1417 "pand %%mm6, %%mm2 \n\t" 1417 "pand %%mm6, %%mm2 \n\t"
1418 "por %%mm0, %%mm1 \n\t" 1418 "por %%mm0, %%mm1 \n\t"
1419 "por %%mm2, %%mm1 \n\t" 1419 "por %%mm2, %%mm1 \n\t"
1420 "movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B 1420 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1421 MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R 1421 MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R
1422 "movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR 1422 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1423 "movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG 1423 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1424 "pand %%mm6, %%mm0 \n\t" 1424 "pand %%mm6, %%mm0 \n\t"
1425 "pand %%mm7, %%mm1 \n\t" 1425 "pand %%mm7, %%mm1 \n\t"
1426 "pand %%mm5, %%mm2 \n\t" 1426 "pand %%mm5, %%mm2 \n\t"
1427 "por %%mm0, %%mm1 \n\t" 1427 "por %%mm0, %%mm1 \n\t"
1428 "por %%mm2, %%mm1 \n\t" 1428 "por %%mm2, %%mm1 \n\t"
1429 MOVNTQ" %%mm1, 16(%2, %%eax) \n\t" 1429 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
1430 "addl $24, %%eax \n\t" 1430 "add $24, %%"REG_a" \n\t"
1431 " js 1b \n\t" 1431 " js 1b \n\t"
1432 : "+a" (mmx_size) 1432 : "+a" (mmx_size)
1433 : "r" (src-mmx_size), "r"(dst-mmx_size) 1433 : "r" (src-mmx_size), "r"(dst-mmx_size)
1434 ); 1434 );
1435 1435
1463 for(y=0; y<height; y++) 1463 for(y=0; y<height; y++)
1464 { 1464 {
1465 #ifdef HAVE_MMX 1465 #ifdef HAVE_MMX
1466 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway) 1466 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1467 asm volatile( 1467 asm volatile(
1468 "xorl %%eax, %%eax \n\t" 1468 "xor %%"REG_a", %%"REG_a" \n\t"
1469 ".balign 16 \n\t" 1469 ".balign 16 \n\t"
1470 "1: \n\t" 1470 "1: \n\t"
1471 PREFETCH" 32(%1, %%eax, 2) \n\t" 1471 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1472 PREFETCH" 32(%2, %%eax) \n\t" 1472 PREFETCH" 32(%2, %%"REG_a") \n\t"
1473 PREFETCH" 32(%3, %%eax) \n\t" 1473 PREFETCH" 32(%3, %%"REG_a") \n\t"
1474 "movq (%2, %%eax), %%mm0 \n\t" // U(0) 1474 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1475 "movq %%mm0, %%mm2 \n\t" // U(0) 1475 "movq %%mm0, %%mm2 \n\t" // U(0)
1476 "movq (%3, %%eax), %%mm1 \n\t" // V(0) 1476 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1477 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) 1477 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1478 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) 1478 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1479 1479
1480 "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0) 1480 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1481 "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8) 1481 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1482 "movq %%mm3, %%mm4 \n\t" // Y(0) 1482 "movq %%mm3, %%mm4 \n\t" // Y(0)
1483 "movq %%mm5, %%mm6 \n\t" // Y(8) 1483 "movq %%mm5, %%mm6 \n\t" // Y(8)
1484 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) 1484 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1485 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) 1485 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1486 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) 1486 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1487 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) 1487 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1488 1488
1489 MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t" 1489 MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1490 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t" 1490 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1491 MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t" 1491 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1492 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t" 1492 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1493 1493
1494 "addl $8, %%eax \n\t" 1494 "add $8, %%"REG_a" \n\t"
1495 "cmpl %4, %%eax \n\t" 1495 "cmp %4, %%"REG_a" \n\t"
1496 " jb 1b \n\t" 1496 " jb 1b \n\t"
1497 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) 1497 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" ((long)chromWidth)
1498 : "%eax" 1498 : "%"REG_a
1499 ); 1499 );
1500 #else 1500 #else
1501 1501
1502 #if defined ARCH_ALPHA && defined HAVE_MVI 1502 #if defined ARCH_ALPHA && defined HAVE_MVI
1503 #define pl2yuy2(n) \ 1503 #define pl2yuy2(n) \
1616 for(y=0; y<height; y++) 1616 for(y=0; y<height; y++)
1617 { 1617 {
1618 #ifdef HAVE_MMX 1618 #ifdef HAVE_MMX
1619 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway) 1619 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1620 asm volatile( 1620 asm volatile(
1621 "xorl %%eax, %%eax \n\t" 1621 "xor %%"REG_a", %%"REG_a" \n\t"
1622 ".balign 16 \n\t" 1622 ".balign 16 \n\t"
1623 "1: \n\t" 1623 "1: \n\t"
1624 PREFETCH" 32(%1, %%eax, 2) \n\t" 1624 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1625 PREFETCH" 32(%2, %%eax) \n\t" 1625 PREFETCH" 32(%2, %%"REG_a") \n\t"
1626 PREFETCH" 32(%3, %%eax) \n\t" 1626 PREFETCH" 32(%3, %%"REG_a") \n\t"
1627 "movq (%2, %%eax), %%mm0 \n\t" // U(0) 1627 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1628 "movq %%mm0, %%mm2 \n\t" // U(0) 1628 "movq %%mm0, %%mm2 \n\t" // U(0)
1629 "movq (%3, %%eax), %%mm1 \n\t" // V(0) 1629 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1630 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) 1630 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1631 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) 1631 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1632 1632
1633 "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0) 1633 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1634 "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8) 1634 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1635 "movq %%mm0, %%mm4 \n\t" // Y(0) 1635 "movq %%mm0, %%mm4 \n\t" // Y(0)
1636 "movq %%mm2, %%mm6 \n\t" // Y(8) 1636 "movq %%mm2, %%mm6 \n\t" // Y(8)
1637 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0) 1637 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1638 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4) 1638 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1639 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8) 1639 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1640 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12) 1640 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1641 1641
1642 MOVNTQ" %%mm0, (%0, %%eax, 4) \n\t" 1642 MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1643 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t" 1643 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1644 MOVNTQ" %%mm2, 16(%0, %%eax, 4) \n\t" 1644 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1645 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t" 1645 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1646 1646
1647 "addl $8, %%eax \n\t" 1647 "add $8, %%"REG_a" \n\t"
1648 "cmpl %4, %%eax \n\t" 1648 "cmp %4, %%"REG_a" \n\t"
1649 " jb 1b \n\t" 1649 " jb 1b \n\t"
1650 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) 1650 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" ((long)chromWidth)
1651 : "%eax" 1651 : "%"REG_a
1652 ); 1652 );
1653 #else 1653 #else
1654 //FIXME adapt the alpha asm code from yv12->yuy2 1654 //FIXME adapt the alpha asm code from yv12->yuy2
1655 1655
1656 #if __WORDSIZE >= 64 1656 #if __WORDSIZE >= 64
1738 const unsigned chromWidth= width>>1; 1738 const unsigned chromWidth= width>>1;
1739 for(y=0; y<height; y+=2) 1739 for(y=0; y<height; y+=2)
1740 { 1740 {
1741 #ifdef HAVE_MMX 1741 #ifdef HAVE_MMX
1742 asm volatile( 1742 asm volatile(
1743 "xorl %%eax, %%eax \n\t" 1743 "xor %%"REG_a", %%"REG_a" \n\t"
1744 "pcmpeqw %%mm7, %%mm7 \n\t" 1744 "pcmpeqw %%mm7, %%mm7 \n\t"
1745 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... 1745 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1746 ".balign 16 \n\t" 1746 ".balign 16 \n\t"
1747 "1: \n\t" 1747 "1: \n\t"
1748 PREFETCH" 64(%0, %%eax, 4) \n\t" 1748 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1749 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0) 1749 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1750 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4) 1750 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1751 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) 1751 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1752 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) 1752 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1753 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) 1753 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1754 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) 1754 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1755 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) 1755 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1756 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) 1756 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1757 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) 1757 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1758 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) 1758 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1759 1759
1760 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t" 1760 MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1761 1761
1762 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8) 1762 "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1763 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12) 1763 "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1764 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) 1764 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1765 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) 1765 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1766 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) 1766 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1767 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) 1767 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1768 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) 1768 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1769 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) 1769 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1770 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) 1770 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1771 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) 1771 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1772 1772
1773 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t" 1773 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
1774 1774
1775 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) 1775 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1776 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) 1776 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1777 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) 1777 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1778 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) 1778 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1779 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) 1779 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1780 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) 1780 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1781 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) 1781 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1782 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) 1782 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1783 1783
1784 MOVNTQ" %%mm0, (%3, %%eax) \n\t" 1784 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1785 MOVNTQ" %%mm2, (%2, %%eax) \n\t" 1785 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1786 1786
1787 "addl $8, %%eax \n\t" 1787 "add $8, %%"REG_a" \n\t"
1788 "cmpl %4, %%eax \n\t" 1788 "cmp %4, %%"REG_a" \n\t"
1789 " jb 1b \n\t" 1789 " jb 1b \n\t"
1790 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) 1790 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" ((long)chromWidth)
1791 : "memory", "%eax" 1791 : "memory", "%"REG_a
1792 ); 1792 );
1793 1793
1794 ydst += lumStride; 1794 ydst += lumStride;
1795 src += srcStride; 1795 src += srcStride;
1796 1796
1797 asm volatile( 1797 asm volatile(
1798 "xorl %%eax, %%eax \n\t" 1798 "xor %%"REG_a", %%"REG_a" \n\t"
1799 ".balign 16 \n\t" 1799 ".balign 16 \n\t"
1800 "1: \n\t" 1800 "1: \n\t"
1801 PREFETCH" 64(%0, %%eax, 4) \n\t" 1801 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1802 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0) 1802 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1803 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4) 1803 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1804 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8) 1804 "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1805 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12) 1805 "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1806 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) 1806 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1807 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) 1807 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1808 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) 1808 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1809 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) 1809 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1810 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) 1810 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1811 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) 1811 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1812 1812
1813 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t" 1813 MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1814 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t" 1814 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1815 1815
1816 "addl $8, %%eax \n\t" 1816 "add $8, %%"REG_a" \n\t"
1817 "cmpl %4, %%eax \n\t" 1817 "cmp %4, %%"REG_a" \n\t"
1818 " jb 1b \n\t" 1818 " jb 1b \n\t"
1819 1819
1820 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) 1820 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" ((long)chromWidth)
1821 : "memory", "%eax" 1821 : "memory", "%"REG_a
1822 ); 1822 );
1823 #else 1823 #else
1824 unsigned i; 1824 unsigned i;
1825 for(i=0; i<chromWidth; i++) 1825 for(i=0; i<chromWidth; i++)
1826 { 1826 {
1875 1875
1876 dst+= dstStride; 1876 dst+= dstStride;
1877 1877
1878 for(y=1; y<srcHeight; y++){ 1878 for(y=1; y<srcHeight; y++){
1879 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1879 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1880 const int mmxSize= srcWidth&~15; 1880 const long mmxSize= srcWidth&~15;
1881 asm volatile( 1881 asm volatile(
1882 "movl %4, %%eax \n\t" 1882 "mov %4, %%"REG_a" \n\t"
1883 "1: \n\t" 1883 "1: \n\t"
1884 "movq (%0, %%eax), %%mm0 \n\t" 1884 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1885 "movq (%1, %%eax), %%mm1 \n\t" 1885 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1886 "movq 1(%0, %%eax), %%mm2 \n\t" 1886 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1887 "movq 1(%1, %%eax), %%mm3 \n\t" 1887 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1888 "movq -1(%0, %%eax), %%mm4 \n\t" 1888 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1889 "movq -1(%1, %%eax), %%mm5 \n\t" 1889 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1890 PAVGB" %%mm0, %%mm5 \n\t" 1890 PAVGB" %%mm0, %%mm5 \n\t"
1891 PAVGB" %%mm0, %%mm3 \n\t" 1891 PAVGB" %%mm0, %%mm3 \n\t"
1892 PAVGB" %%mm0, %%mm5 \n\t" 1892 PAVGB" %%mm0, %%mm5 \n\t"
1893 PAVGB" %%mm0, %%mm3 \n\t" 1893 PAVGB" %%mm0, %%mm3 \n\t"
1894 PAVGB" %%mm1, %%mm4 \n\t" 1894 PAVGB" %%mm1, %%mm4 \n\t"
1900 "punpcklbw %%mm3, %%mm5 \n\t" 1900 "punpcklbw %%mm3, %%mm5 \n\t"
1901 "punpckhbw %%mm3, %%mm7 \n\t" 1901 "punpckhbw %%mm3, %%mm7 \n\t"
1902 "punpcklbw %%mm2, %%mm4 \n\t" 1902 "punpcklbw %%mm2, %%mm4 \n\t"
1903 "punpckhbw %%mm2, %%mm6 \n\t" 1903 "punpckhbw %%mm2, %%mm6 \n\t"
1904 #if 1 1904 #if 1
1905 MOVNTQ" %%mm5, (%2, %%eax, 2) \n\t" 1905 MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1906 MOVNTQ" %%mm7, 8(%2, %%eax, 2) \n\t" 1906 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1907 MOVNTQ" %%mm4, (%3, %%eax, 2) \n\t" 1907 MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1908 MOVNTQ" %%mm6, 8(%3, %%eax, 2) \n\t" 1908 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1909 #else 1909 #else
1910 "movq %%mm5, (%2, %%eax, 2) \n\t" 1910 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1911 "movq %%mm7, 8(%2, %%eax, 2) \n\t" 1911 "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1912 "movq %%mm4, (%3, %%eax, 2) \n\t" 1912 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1913 "movq %%mm6, 8(%3, %%eax, 2) \n\t" 1913 "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1914 #endif 1914 #endif
1915 "addl $8, %%eax \n\t" 1915 "add $8, %%"REG_a" \n\t"
1916 " js 1b \n\t" 1916 " js 1b \n\t"
1917 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), 1917 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1918 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), 1918 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1919 "g" (-mmxSize) 1919 "g" (-mmxSize)
1920 : "%eax" 1920 : "%"REG_a
1921 1921
1922 ); 1922 );
1923 #else 1923 #else
1924 const int mmxSize=1; 1924 const int mmxSize=1;
1925 #endif 1925 #endif
2105 { 2105 {
2106 unsigned i; 2106 unsigned i;
2107 for(i=0; i<2; i++) 2107 for(i=0; i<2; i++)
2108 { 2108 {
2109 asm volatile( 2109 asm volatile(
2110 "movl %2, %%eax \n\t" 2110 "mov %2, %%"REG_a" \n\t"
2111 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t" 2111 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2112 "movq "MANGLE(w1111)", %%mm5 \n\t" 2112 "movq "MANGLE(w1111)", %%mm5 \n\t"
2113 "pxor %%mm7, %%mm7 \n\t" 2113 "pxor %%mm7, %%mm7 \n\t"
2114 "leal (%%eax, %%eax, 2), %%ebx \n\t" 2114 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2115 ".balign 16 \n\t" 2115 ".balign 16 \n\t"
2116 "1: \n\t" 2116 "1: \n\t"
2117 PREFETCH" 64(%0, %%ebx) \n\t" 2117 PREFETCH" 64(%0, %%"REG_b") \n\t"
2118 "movd (%0, %%ebx), %%mm0 \n\t" 2118 "movd (%0, %%"REG_b"), %%mm0 \n\t"
2119 "movd 3(%0, %%ebx), %%mm1 \n\t" 2119 "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
2120 "punpcklbw %%mm7, %%mm0 \n\t" 2120 "punpcklbw %%mm7, %%mm0 \n\t"
2121 "punpcklbw %%mm7, %%mm1 \n\t" 2121 "punpcklbw %%mm7, %%mm1 \n\t"
2122 "movd 6(%0, %%ebx), %%mm2 \n\t" 2122 "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
2123 "movd 9(%0, %%ebx), %%mm3 \n\t" 2123 "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
2124 "punpcklbw %%mm7, %%mm2 \n\t" 2124 "punpcklbw %%mm7, %%mm2 \n\t"
2125 "punpcklbw %%mm7, %%mm3 \n\t" 2125 "punpcklbw %%mm7, %%mm3 \n\t"
2126 "pmaddwd %%mm6, %%mm0 \n\t" 2126 "pmaddwd %%mm6, %%mm0 \n\t"
2127 "pmaddwd %%mm6, %%mm1 \n\t" 2127 "pmaddwd %%mm6, %%mm1 \n\t"
2128 "pmaddwd %%mm6, %%mm2 \n\t" 2128 "pmaddwd %%mm6, %%mm2 \n\t"
2138 "pmaddwd %%mm5, %%mm0 \n\t" 2138 "pmaddwd %%mm5, %%mm0 \n\t"
2139 "pmaddwd %%mm5, %%mm2 \n\t" 2139 "pmaddwd %%mm5, %%mm2 \n\t"
2140 "packssdw %%mm2, %%mm0 \n\t" 2140 "packssdw %%mm2, %%mm0 \n\t"
2141 "psraw $7, %%mm0 \n\t" 2141 "psraw $7, %%mm0 \n\t"
2142 2142
2143 "movd 12(%0, %%ebx), %%mm4 \n\t" 2143 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
2144 "movd 15(%0, %%ebx), %%mm1 \n\t" 2144 "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
2145 "punpcklbw %%mm7, %%mm4 \n\t" 2145 "punpcklbw %%mm7, %%mm4 \n\t"
2146 "punpcklbw %%mm7, %%mm1 \n\t" 2146 "punpcklbw %%mm7, %%mm1 \n\t"
2147 "movd 18(%0, %%ebx), %%mm2 \n\t" 2147 "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
2148 "movd 21(%0, %%ebx), %%mm3 \n\t" 2148 "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
2149 "punpcklbw %%mm7, %%mm2 \n\t" 2149 "punpcklbw %%mm7, %%mm2 \n\t"
2150 "punpcklbw %%mm7, %%mm3 \n\t" 2150 "punpcklbw %%mm7, %%mm3 \n\t"
2151 "pmaddwd %%mm6, %%mm4 \n\t" 2151 "pmaddwd %%mm6, %%mm4 \n\t"
2152 "pmaddwd %%mm6, %%mm1 \n\t" 2152 "pmaddwd %%mm6, %%mm1 \n\t"
2153 "pmaddwd %%mm6, %%mm2 \n\t" 2153 "pmaddwd %%mm6, %%mm2 \n\t"
2160 #endif 2160 #endif
2161 "packssdw %%mm1, %%mm4 \n\t" 2161 "packssdw %%mm1, %%mm4 \n\t"
2162 "packssdw %%mm3, %%mm2 \n\t" 2162 "packssdw %%mm3, %%mm2 \n\t"
2163 "pmaddwd %%mm5, %%mm4 \n\t" 2163 "pmaddwd %%mm5, %%mm4 \n\t"
2164 "pmaddwd %%mm5, %%mm2 \n\t" 2164 "pmaddwd %%mm5, %%mm2 \n\t"
2165 "addl $24, %%ebx \n\t" 2165 "add $24, %%"REG_b" \n\t"
2166 "packssdw %%mm2, %%mm4 \n\t" 2166 "packssdw %%mm2, %%mm4 \n\t"
2167 "psraw $7, %%mm4 \n\t" 2167 "psraw $7, %%mm4 \n\t"
2168 2168
2169 "packuswb %%mm4, %%mm0 \n\t" 2169 "packuswb %%mm4, %%mm0 \n\t"
2170 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" 2170 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2171 2171
2172 MOVNTQ" %%mm0, (%1, %%eax) \n\t" 2172 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2173 "addl $8, %%eax \n\t" 2173 "add $8, %%"REG_a" \n\t"
2174 " js 1b \n\t" 2174 " js 1b \n\t"
2175 : : "r" (src+width*3), "r" (ydst+width), "g" (-width) 2175 : : "r" (src+width*3), "r" (ydst+width), "g" ((long)-width)
2176 : "%eax", "%ebx" 2176 : "%"REG_a, "%"REG_b
2177 ); 2177 );
2178 ydst += lumStride; 2178 ydst += lumStride;
2179 src += srcStride; 2179 src += srcStride;
2180 } 2180 }
2181 src -= srcStride*2; 2181 src -= srcStride*2;
2182 asm volatile( 2182 asm volatile(
2183 "movl %4, %%eax \n\t" 2183 "mov %4, %%"REG_a" \n\t"
2184 "movq "MANGLE(w1111)", %%mm5 \n\t" 2184 "movq "MANGLE(w1111)", %%mm5 \n\t"
2185 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" 2185 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2186 "pxor %%mm7, %%mm7 \n\t" 2186 "pxor %%mm7, %%mm7 \n\t"
2187 "leal (%%eax, %%eax, 2), %%ebx \n\t" 2187 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2188 "addl %%ebx, %%ebx \n\t" 2188 "add %%"REG_b", %%"REG_b" \n\t"
2189 ".balign 16 \n\t" 2189 ".balign 16 \n\t"
2190 "1: \n\t" 2190 "1: \n\t"
2191 PREFETCH" 64(%0, %%ebx) \n\t" 2191 PREFETCH" 64(%0, %%"REG_b") \n\t"
2192 PREFETCH" 64(%1, %%ebx) \n\t" 2192 PREFETCH" 64(%1, %%"REG_b") \n\t"
2193 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 2193 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2194 "movq (%0, %%ebx), %%mm0 \n\t" 2194 "movq (%0, %%"REG_b"), %%mm0 \n\t"
2195 "movq (%1, %%ebx), %%mm1 \n\t" 2195 "movq (%1, %%"REG_b"), %%mm1 \n\t"
2196 "movq 6(%0, %%ebx), %%mm2 \n\t" 2196 "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
2197 "movq 6(%1, %%ebx), %%mm3 \n\t" 2197 "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
2198 PAVGB" %%mm1, %%mm0 \n\t" 2198 PAVGB" %%mm1, %%mm0 \n\t"
2199 PAVGB" %%mm3, %%mm2 \n\t" 2199 PAVGB" %%mm3, %%mm2 \n\t"
2200 "movq %%mm0, %%mm1 \n\t" 2200 "movq %%mm0, %%mm1 \n\t"
2201 "movq %%mm2, %%mm3 \n\t" 2201 "movq %%mm2, %%mm3 \n\t"
2202 "psrlq $24, %%mm0 \n\t" 2202 "psrlq $24, %%mm0 \n\t"
2204 PAVGB" %%mm1, %%mm0 \n\t" 2204 PAVGB" %%mm1, %%mm0 \n\t"
2205 PAVGB" %%mm3, %%mm2 \n\t" 2205 PAVGB" %%mm3, %%mm2 \n\t"
2206 "punpcklbw %%mm7, %%mm0 \n\t" 2206 "punpcklbw %%mm7, %%mm0 \n\t"
2207 "punpcklbw %%mm7, %%mm2 \n\t" 2207 "punpcklbw %%mm7, %%mm2 \n\t"
2208 #else 2208 #else
2209 "movd (%0, %%ebx), %%mm0 \n\t" 2209 "movd (%0, %%"REG_b"), %%mm0 \n\t"
2210 "movd (%1, %%ebx), %%mm1 \n\t" 2210 "movd (%1, %%"REG_b"), %%mm1 \n\t"
2211 "movd 3(%0, %%ebx), %%mm2 \n\t" 2211 "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
2212 "movd 3(%1, %%ebx), %%mm3 \n\t" 2212 "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
2213 "punpcklbw %%mm7, %%mm0 \n\t" 2213 "punpcklbw %%mm7, %%mm0 \n\t"
2214 "punpcklbw %%mm7, %%mm1 \n\t" 2214 "punpcklbw %%mm7, %%mm1 \n\t"
2215 "punpcklbw %%mm7, %%mm2 \n\t" 2215 "punpcklbw %%mm7, %%mm2 \n\t"
2216 "punpcklbw %%mm7, %%mm3 \n\t" 2216 "punpcklbw %%mm7, %%mm3 \n\t"
2217 "paddw %%mm1, %%mm0 \n\t" 2217 "paddw %%mm1, %%mm0 \n\t"
2218 "paddw %%mm3, %%mm2 \n\t" 2218 "paddw %%mm3, %%mm2 \n\t"
2219 "paddw %%mm2, %%mm0 \n\t" 2219 "paddw %%mm2, %%mm0 \n\t"
2220 "movd 6(%0, %%ebx), %%mm4 \n\t" 2220 "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
2221 "movd 6(%1, %%ebx), %%mm1 \n\t" 2221 "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
2222 "movd 9(%0, %%ebx), %%mm2 \n\t" 2222 "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
2223 "movd 9(%1, %%ebx), %%mm3 \n\t" 2223 "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
2224 "punpcklbw %%mm7, %%mm4 \n\t" 2224 "punpcklbw %%mm7, %%mm4 \n\t"
2225 "punpcklbw %%mm7, %%mm1 \n\t" 2225 "punpcklbw %%mm7, %%mm1 \n\t"
2226 "punpcklbw %%mm7, %%mm2 \n\t" 2226 "punpcklbw %%mm7, %%mm2 \n\t"
2227 "punpcklbw %%mm7, %%mm3 \n\t" 2227 "punpcklbw %%mm7, %%mm3 \n\t"
2228 "paddw %%mm1, %%mm4 \n\t" 2228 "paddw %%mm1, %%mm4 \n\t"
2250 "pmaddwd %%mm5, %%mm1 \n\t" 2250 "pmaddwd %%mm5, %%mm1 \n\t"
2251 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 2251 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2252 "psraw $7, %%mm0 \n\t" 2252 "psraw $7, %%mm0 \n\t"
2253 2253
2254 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 2254 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2255 "movq 12(%0, %%ebx), %%mm4 \n\t" 2255 "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
2256 "movq 12(%1, %%ebx), %%mm1 \n\t" 2256 "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
2257 "movq 18(%0, %%ebx), %%mm2 \n\t" 2257 "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
2258 "movq 18(%1, %%ebx), %%mm3 \n\t" 2258 "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
2259 PAVGB" %%mm1, %%mm4 \n\t" 2259 PAVGB" %%mm1, %%mm4 \n\t"
2260 PAVGB" %%mm3, %%mm2 \n\t" 2260 PAVGB" %%mm3, %%mm2 \n\t"
2261 "movq %%mm4, %%mm1 \n\t" 2261 "movq %%mm4, %%mm1 \n\t"
2262 "movq %%mm2, %%mm3 \n\t" 2262 "movq %%mm2, %%mm3 \n\t"
2263 "psrlq $24, %%mm4 \n\t" 2263 "psrlq $24, %%mm4 \n\t"
2265 PAVGB" %%mm1, %%mm4 \n\t" 2265 PAVGB" %%mm1, %%mm4 \n\t"
2266 PAVGB" %%mm3, %%mm2 \n\t" 2266 PAVGB" %%mm3, %%mm2 \n\t"
2267 "punpcklbw %%mm7, %%mm4 \n\t" 2267 "punpcklbw %%mm7, %%mm4 \n\t"
2268 "punpcklbw %%mm7, %%mm2 \n\t" 2268 "punpcklbw %%mm7, %%mm2 \n\t"
2269 #else 2269 #else
2270 "movd 12(%0, %%ebx), %%mm4 \n\t" 2270 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
2271 "movd 12(%1, %%ebx), %%mm1 \n\t" 2271 "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
2272 "movd 15(%0, %%ebx), %%mm2 \n\t" 2272 "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
2273 "movd 15(%1, %%ebx), %%mm3 \n\t" 2273 "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
2274 "punpcklbw %%mm7, %%mm4 \n\t" 2274 "punpcklbw %%mm7, %%mm4 \n\t"
2275 "punpcklbw %%mm7, %%mm1 \n\t" 2275 "punpcklbw %%mm7, %%mm1 \n\t"
2276 "punpcklbw %%mm7, %%mm2 \n\t" 2276 "punpcklbw %%mm7, %%mm2 \n\t"
2277 "punpcklbw %%mm7, %%mm3 \n\t" 2277 "punpcklbw %%mm7, %%mm3 \n\t"
2278 "paddw %%mm1, %%mm4 \n\t" 2278 "paddw %%mm1, %%mm4 \n\t"
2279 "paddw %%mm3, %%mm2 \n\t" 2279 "paddw %%mm3, %%mm2 \n\t"
2280 "paddw %%mm2, %%mm4 \n\t" 2280 "paddw %%mm2, %%mm4 \n\t"
2281 "movd 18(%0, %%ebx), %%mm5 \n\t" 2281 "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
2282 "movd 18(%1, %%ebx), %%mm1 \n\t" 2282 "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
2283 "movd 21(%0, %%ebx), %%mm2 \n\t" 2283 "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
2284 "movd 21(%1, %%ebx), %%mm3 \n\t" 2284 "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
2285 "punpcklbw %%mm7, %%mm5 \n\t" 2285 "punpcklbw %%mm7, %%mm5 \n\t"
2286 "punpcklbw %%mm7, %%mm1 \n\t" 2286 "punpcklbw %%mm7, %%mm1 \n\t"
2287 "punpcklbw %%mm7, %%mm2 \n\t" 2287 "punpcklbw %%mm7, %%mm2 \n\t"
2288 "punpcklbw %%mm7, %%mm3 \n\t" 2288 "punpcklbw %%mm7, %%mm3 \n\t"
2289 "paddw %%mm1, %%mm5 \n\t" 2289 "paddw %%mm1, %%mm5 \n\t"
2308 #endif 2308 #endif
2309 "packssdw %%mm2, %%mm4 \n\t" 2309 "packssdw %%mm2, %%mm4 \n\t"
2310 "packssdw %%mm3, %%mm1 \n\t" 2310 "packssdw %%mm3, %%mm1 \n\t"
2311 "pmaddwd %%mm5, %%mm4 \n\t" 2311 "pmaddwd %%mm5, %%mm4 \n\t"
2312 "pmaddwd %%mm5, %%mm1 \n\t" 2312 "pmaddwd %%mm5, %%mm1 \n\t"
2313 "addl $24, %%ebx \n\t" 2313 "add $24, %%"REG_b" \n\t"
2314 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 2314 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2315 "psraw $7, %%mm4 \n\t" 2315 "psraw $7, %%mm4 \n\t"
2316 2316
2317 "movq %%mm0, %%mm1 \n\t" 2317 "movq %%mm0, %%mm1 \n\t"
2318 "punpckldq %%mm4, %%mm0 \n\t" 2318 "punpckldq %%mm4, %%mm0 \n\t"
2319 "punpckhdq %%mm4, %%mm1 \n\t" 2319 "punpckhdq %%mm4, %%mm1 \n\t"
2320 "packsswb %%mm1, %%mm0 \n\t" 2320 "packsswb %%mm1, %%mm0 \n\t"
2321 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" 2321 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2322 2322 "movd %%mm0, (%2, %%"REG_a") \n\t"
2323 "movd %%mm0, (%2, %%eax) \n\t"
2324 "punpckhdq %%mm0, %%mm0 \n\t" 2323 "punpckhdq %%mm0, %%mm0 \n\t"
2325 "movd %%mm0, (%3, %%eax) \n\t" 2324 "movd %%mm0, (%3, %%"REG_a") \n\t"
2326 "addl $4, %%eax \n\t" 2325 "add $4, %%"REG_a" \n\t"
2327 " js 1b \n\t" 2326 " js 1b \n\t"
2328 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth) 2327 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" ((long)-chromWidth)
2329 : "%eax", "%ebx" 2328 : "%"REG_a, "%"REG_b
2330 ); 2329 );
2331 2330
2332 udst += chromStride; 2331 udst += chromStride;
2333 vdst += chromStride; 2332 vdst += chromStride;
2334 src += srcStride*2; 2333 src += srcStride*2;
2401 unsigned w; 2400 unsigned w;
2402 2401
2403 #ifdef HAVE_MMX 2402 #ifdef HAVE_MMX
2404 #ifdef HAVE_SSE2 2403 #ifdef HAVE_SSE2
2405 asm( 2404 asm(
2406 "xorl %%eax, %%eax \n\t" 2405 "xor %%"REG_a", %%"REG_a" \n\t"
2407 "1: \n\t" 2406 "1: \n\t"
2408 PREFETCH" 64(%1, %%eax) \n\t" 2407 PREFETCH" 64(%1, %%"REG_a") \n\t"
2409 PREFETCH" 64(%2, %%eax) \n\t" 2408 PREFETCH" 64(%2, %%"REG_a") \n\t"
2410 "movdqa (%1, %%eax), %%xmm0 \n\t" 2409 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2411 "movdqa (%1, %%eax), %%xmm1 \n\t" 2410 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2412 "movdqa (%2, %%eax), %%xmm2 \n\t" 2411 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2413 "punpcklbw %%xmm2, %%xmm0 \n\t" 2412 "punpcklbw %%xmm2, %%xmm0 \n\t"
2414 "punpckhbw %%xmm2, %%xmm1 \n\t" 2413 "punpckhbw %%xmm2, %%xmm1 \n\t"
2415 "movntdq %%xmm0, (%0, %%eax, 2) \n\t" 2414 "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2416 "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t" 2415 "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2417 "addl $16, %%eax \n\t" 2416 "add $16, %%"REG_a" \n\t"
2418 "cmpl %3, %%eax \n\t" 2417 "cmp %3, %%"REG_a" \n\t"
2419 " jb 1b \n\t" 2418 " jb 1b \n\t"
2420 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) 2419 ::"r"(dest), "r"(src1), "r"(src2), "r" ((long)width-15)
2421 : "memory", "%eax" 2420 : "memory", "%"REG_a""
2422 ); 2421 );
2423 #else 2422 #else
2424 asm( 2423 asm(
2425 "xorl %%eax, %%eax \n\t" 2424 "xor %%"REG_a", %%"REG_a" \n\t"
2426 "1: \n\t" 2425 "1: \n\t"
2427 PREFETCH" 64(%1, %%eax) \n\t" 2426 PREFETCH" 64(%1, %%"REG_a") \n\t"
2428 PREFETCH" 64(%2, %%eax) \n\t" 2427 PREFETCH" 64(%2, %%"REG_a") \n\t"
2429 "movq (%1, %%eax), %%mm0 \n\t" 2428 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2430 "movq 8(%1, %%eax), %%mm2 \n\t" 2429 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2431 "movq %%mm0, %%mm1 \n\t" 2430 "movq %%mm0, %%mm1 \n\t"
2432 "movq %%mm2, %%mm3 \n\t" 2431 "movq %%mm2, %%mm3 \n\t"
2433 "movq (%2, %%eax), %%mm4 \n\t" 2432 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2434 "movq 8(%2, %%eax), %%mm5 \n\t" 2433 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2435 "punpcklbw %%mm4, %%mm0 \n\t" 2434 "punpcklbw %%mm4, %%mm0 \n\t"
2436 "punpckhbw %%mm4, %%mm1 \n\t" 2435 "punpckhbw %%mm4, %%mm1 \n\t"
2437 "punpcklbw %%mm5, %%mm2 \n\t" 2436 "punpcklbw %%mm5, %%mm2 \n\t"
2438 "punpckhbw %%mm5, %%mm3 \n\t" 2437 "punpckhbw %%mm5, %%mm3 \n\t"
2439 MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t" 2438 MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2440 MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t" 2439 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2441 MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t" 2440 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2442 MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t" 2441 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2443 "addl $16, %%eax \n\t" 2442 "add $16, %%"REG_a" \n\t"
2444 "cmpl %3, %%eax \n\t" 2443 "cmp %3, %%"REG_a" \n\t"
2445 " jb 1b \n\t" 2444 " jb 1b \n\t"
2446 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) 2445 ::"r"(dest), "r"(src1), "r"(src2), "r" ((long)width-15)
2447 : "memory", "%eax" 2446 : "memory", "%"REG_a
2448 ); 2447 );
2449 #endif 2448 #endif
2450 for(w= (width&(~15)); w < width; w++) 2449 for(w= (width&(~15)); w < width; w++)
2451 { 2450 {
2452 dest[2*w+0] = src1[w]; 2451 dest[2*w+0] = src1[w];
2580 uint8_t *dst, 2579 uint8_t *dst,
2581 unsigned width, unsigned height, 2580 unsigned width, unsigned height,
2582 int srcStride1, int srcStride2, 2581 int srcStride1, int srcStride2,
2583 int srcStride3, int dstStride) 2582 int srcStride3, int dstStride)
2584 { 2583 {
2585 unsigned y,x,w,h; 2584 unsigned long y,x,w,h;
2586 w=width/2; h=height; 2585 w=width/2; h=height;
2587 for(y=0;y<h;y++){ 2586 for(y=0;y<h;y++){
2588 const uint8_t* yp=src1+srcStride1*y; 2587 const uint8_t* yp=src1+srcStride1*y;
2589 const uint8_t* up=src2+srcStride2*(y>>2); 2588 const uint8_t* up=src2+srcStride2*(y>>2);
2590 const uint8_t* vp=src3+srcStride3*(y>>2); 2589 const uint8_t* vp=src3+srcStride3*(y>>2);