Mercurial > mplayer.hg
comparison postproc/rgb2rgb_template.c @ 13720:821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
author | aurel |
---|---|
date | Thu, 21 Oct 2004 11:55:20 +0000 |
parents | 7f5ea5da1765 |
children | 49dd10a86b23 |
comparison
equal
deleted
inserted
replaced
13719:43ecd6a73ec0 | 13720:821f464b4d90 |
---|---|
347 "por %%mm4, %%mm3 \n\t" | 347 "por %%mm4, %%mm3 \n\t" |
348 "psrld $5, %%mm0 \n\t" | 348 "psrld $5, %%mm0 \n\t" |
349 "pslld $11, %%mm3 \n\t" | 349 "pslld $11, %%mm3 \n\t" |
350 "por %%mm3, %%mm0 \n\t" | 350 "por %%mm3, %%mm0 \n\t" |
351 MOVNTQ" %%mm0, (%0) \n\t" | 351 MOVNTQ" %%mm0, (%0) \n\t" |
352 "addl $16, %1 \n\t" | 352 "add $16, %1 \n\t" |
353 "addl $8, %0 \n\t" | 353 "add $8, %0 \n\t" |
354 "cmpl %2, %1 \n\t" | 354 "cmp %2, %1 \n\t" |
355 " jb 1b \n\t" | 355 " jb 1b \n\t" |
356 : "+r" (d), "+r"(s) | 356 : "+r" (d), "+r"(s) |
357 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) | 357 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) |
358 ); | 358 ); |
359 #else | 359 #else |
507 "por %%mm4, %%mm3 \n\t" | 507 "por %%mm4, %%mm3 \n\t" |
508 "psrld $6, %%mm0 \n\t" | 508 "psrld $6, %%mm0 \n\t" |
509 "pslld $10, %%mm3 \n\t" | 509 "pslld $10, %%mm3 \n\t" |
510 "por %%mm3, %%mm0 \n\t" | 510 "por %%mm3, %%mm0 \n\t" |
511 MOVNTQ" %%mm0, (%0) \n\t" | 511 MOVNTQ" %%mm0, (%0) \n\t" |
512 "addl $16, %1 \n\t" | 512 "add $16, %1 \n\t" |
513 "addl $8, %0 \n\t" | 513 "add $8, %0 \n\t" |
514 "cmpl %2, %1 \n\t" | 514 "cmp %2, %1 \n\t" |
515 " jb 1b \n\t" | 515 " jb 1b \n\t" |
516 : "+r" (d), "+r"(s) | 516 : "+r" (d), "+r"(s) |
517 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) | 517 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) |
518 ); | 518 ); |
519 #else | 519 #else |
1343 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size) | 1343 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size) |
1344 { | 1344 { |
1345 #ifdef HAVE_MMX | 1345 #ifdef HAVE_MMX |
1346 /* TODO: unroll this loop */ | 1346 /* TODO: unroll this loop */ |
1347 asm volatile ( | 1347 asm volatile ( |
1348 "xorl %%eax, %%eax \n\t" | 1348 "xor %%"REG_a", %%"REG_a" \n\t" |
1349 ".balign 16 \n\t" | 1349 ".balign 16 \n\t" |
1350 "1: \n\t" | 1350 "1: \n\t" |
1351 PREFETCH" 32(%0, %%eax) \n\t" | 1351 PREFETCH" 32(%0, %%"REG_a") \n\t" |
1352 "movq (%0, %%eax), %%mm0 \n\t" | 1352 "movq (%0, %%"REG_a"), %%mm0 \n\t" |
1353 "movq %%mm0, %%mm1 \n\t" | 1353 "movq %%mm0, %%mm1 \n\t" |
1354 "movq %%mm0, %%mm2 \n\t" | 1354 "movq %%mm0, %%mm2 \n\t" |
1355 "pslld $16, %%mm0 \n\t" | 1355 "pslld $16, %%mm0 \n\t" |
1356 "psrld $16, %%mm1 \n\t" | 1356 "psrld $16, %%mm1 \n\t" |
1357 "pand "MANGLE(mask32r)", %%mm0 \n\t" | 1357 "pand "MANGLE(mask32r)", %%mm0 \n\t" |
1358 "pand "MANGLE(mask32g)", %%mm2 \n\t" | 1358 "pand "MANGLE(mask32g)", %%mm2 \n\t" |
1359 "pand "MANGLE(mask32b)", %%mm1 \n\t" | 1359 "pand "MANGLE(mask32b)", %%mm1 \n\t" |
1360 "por %%mm0, %%mm2 \n\t" | 1360 "por %%mm0, %%mm2 \n\t" |
1361 "por %%mm1, %%mm2 \n\t" | 1361 "por %%mm1, %%mm2 \n\t" |
1362 MOVNTQ" %%mm2, (%1, %%eax) \n\t" | 1362 MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t" |
1363 "addl $8, %%eax \n\t" | 1363 "add $8, %%"REG_a" \n\t" |
1364 "cmpl %2, %%eax \n\t" | 1364 "cmp %2, %%"REG_a" \n\t" |
1365 " jb 1b \n\t" | 1365 " jb 1b \n\t" |
1366 :: "r" (src), "r"(dst), "r" (src_size-7) | 1366 :: "r" (src), "r"(dst), "r" ((long)src_size-7) |
1367 : "%eax" | 1367 : "%"REG_a |
1368 ); | 1368 ); |
1369 | 1369 |
1370 __asm __volatile(SFENCE:::"memory"); | 1370 __asm __volatile(SFENCE:::"memory"); |
1371 __asm __volatile(EMMS:::"memory"); | 1371 __asm __volatile(EMMS:::"memory"); |
1372 #else | 1372 #else |
1389 | 1389 |
1390 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size) | 1390 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size) |
1391 { | 1391 { |
1392 unsigned i; | 1392 unsigned i; |
1393 #ifdef HAVE_MMX | 1393 #ifdef HAVE_MMX |
1394 int mmx_size= 23 - src_size; | 1394 long mmx_size= 23 - src_size; |
1395 asm volatile ( | 1395 asm volatile ( |
1396 "movq "MANGLE(mask24r)", %%mm5 \n\t" | 1396 "movq "MANGLE(mask24r)", %%mm5 \n\t" |
1397 "movq "MANGLE(mask24g)", %%mm6 \n\t" | 1397 "movq "MANGLE(mask24g)", %%mm6 \n\t" |
1398 "movq "MANGLE(mask24b)", %%mm7 \n\t" | 1398 "movq "MANGLE(mask24b)", %%mm7 \n\t" |
1399 ".balign 16 \n\t" | 1399 ".balign 16 \n\t" |
1400 "1: \n\t" | 1400 "1: \n\t" |
1401 PREFETCH" 32(%1, %%eax) \n\t" | 1401 PREFETCH" 32(%1, %%"REG_a") \n\t" |
1402 "movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG | 1402 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG |
1403 "movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG | 1403 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG |
1404 "movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B | 1404 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B |
1405 "psllq $16, %%mm0 \n\t" // 00 BGR BGR | 1405 "psllq $16, %%mm0 \n\t" // 00 BGR BGR |
1406 "pand %%mm5, %%mm0 \n\t" | 1406 "pand %%mm5, %%mm0 \n\t" |
1407 "pand %%mm6, %%mm1 \n\t" | 1407 "pand %%mm6, %%mm1 \n\t" |
1408 "pand %%mm7, %%mm2 \n\t" | 1408 "pand %%mm7, %%mm2 \n\t" |
1409 "por %%mm0, %%mm1 \n\t" | 1409 "por %%mm0, %%mm1 \n\t" |
1410 "por %%mm2, %%mm1 \n\t" | 1410 "por %%mm2, %%mm1 \n\t" |
1411 "movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG | 1411 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG |
1412 MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG | 1412 MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG |
1413 "movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B | 1413 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B |
1414 "movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR | 1414 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR |
1415 "pand %%mm7, %%mm0 \n\t" | 1415 "pand %%mm7, %%mm0 \n\t" |
1416 "pand %%mm5, %%mm1 \n\t" | 1416 "pand %%mm5, %%mm1 \n\t" |
1417 "pand %%mm6, %%mm2 \n\t" | 1417 "pand %%mm6, %%mm2 \n\t" |
1418 "por %%mm0, %%mm1 \n\t" | 1418 "por %%mm0, %%mm1 \n\t" |
1419 "por %%mm2, %%mm1 \n\t" | 1419 "por %%mm2, %%mm1 \n\t" |
1420 "movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B | 1420 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B |
1421 MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R | 1421 MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R |
1422 "movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR | 1422 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR |
1423 "movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG | 1423 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG |
1424 "pand %%mm6, %%mm0 \n\t" | 1424 "pand %%mm6, %%mm0 \n\t" |
1425 "pand %%mm7, %%mm1 \n\t" | 1425 "pand %%mm7, %%mm1 \n\t" |
1426 "pand %%mm5, %%mm2 \n\t" | 1426 "pand %%mm5, %%mm2 \n\t" |
1427 "por %%mm0, %%mm1 \n\t" | 1427 "por %%mm0, %%mm1 \n\t" |
1428 "por %%mm2, %%mm1 \n\t" | 1428 "por %%mm2, %%mm1 \n\t" |
1429 MOVNTQ" %%mm1, 16(%2, %%eax) \n\t" | 1429 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t" |
1430 "addl $24, %%eax \n\t" | 1430 "add $24, %%"REG_a" \n\t" |
1431 " js 1b \n\t" | 1431 " js 1b \n\t" |
1432 : "+a" (mmx_size) | 1432 : "+a" (mmx_size) |
1433 : "r" (src-mmx_size), "r"(dst-mmx_size) | 1433 : "r" (src-mmx_size), "r"(dst-mmx_size) |
1434 ); | 1434 ); |
1435 | 1435 |
1463 for(y=0; y<height; y++) | 1463 for(y=0; y<height; y++) |
1464 { | 1464 { |
1465 #ifdef HAVE_MMX | 1465 #ifdef HAVE_MMX |
1466 //FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway) | 1466 //FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway) |
1467 asm volatile( | 1467 asm volatile( |
1468 "xorl %%eax, %%eax \n\t" | 1468 "xor %%"REG_a", %%"REG_a" \n\t" |
1469 ".balign 16 \n\t" | 1469 ".balign 16 \n\t" |
1470 "1: \n\t" | 1470 "1: \n\t" |
1471 PREFETCH" 32(%1, %%eax, 2) \n\t" | 1471 PREFETCH" 32(%1, %%"REG_a", 2) \n\t" |
1472 PREFETCH" 32(%2, %%eax) \n\t" | 1472 PREFETCH" 32(%2, %%"REG_a") \n\t" |
1473 PREFETCH" 32(%3, %%eax) \n\t" | 1473 PREFETCH" 32(%3, %%"REG_a") \n\t" |
1474 "movq (%2, %%eax), %%mm0 \n\t" // U(0) | 1474 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) |
1475 "movq %%mm0, %%mm2 \n\t" // U(0) | 1475 "movq %%mm0, %%mm2 \n\t" // U(0) |
1476 "movq (%3, %%eax), %%mm1 \n\t" // V(0) | 1476 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) |
1477 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | 1477 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) |
1478 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) | 1478 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) |
1479 | 1479 |
1480 "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0) | 1480 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) |
1481 "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8) | 1481 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) |
1482 "movq %%mm3, %%mm4 \n\t" // Y(0) | 1482 "movq %%mm3, %%mm4 \n\t" // Y(0) |
1483 "movq %%mm5, %%mm6 \n\t" // Y(8) | 1483 "movq %%mm5, %%mm6 \n\t" // Y(8) |
1484 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) | 1484 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) |
1485 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) | 1485 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) |
1486 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) | 1486 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) |
1487 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) | 1487 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) |
1488 | 1488 |
1489 MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t" | 1489 MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t" |
1490 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t" | 1490 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t" |
1491 MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t" | 1491 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t" |
1492 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t" | 1492 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t" |
1493 | 1493 |
1494 "addl $8, %%eax \n\t" | 1494 "add $8, %%"REG_a" \n\t" |
1495 "cmpl %4, %%eax \n\t" | 1495 "cmp %4, %%"REG_a" \n\t" |
1496 " jb 1b \n\t" | 1496 " jb 1b \n\t" |
1497 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) | 1497 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" ((long)chromWidth) |
1498 : "%eax" | 1498 : "%"REG_a |
1499 ); | 1499 ); |
1500 #else | 1500 #else |
1501 | 1501 |
1502 #if defined ARCH_ALPHA && defined HAVE_MVI | 1502 #if defined ARCH_ALPHA && defined HAVE_MVI |
1503 #define pl2yuy2(n) \ | 1503 #define pl2yuy2(n) \ |
1616 for(y=0; y<height; y++) | 1616 for(y=0; y<height; y++) |
1617 { | 1617 { |
1618 #ifdef HAVE_MMX | 1618 #ifdef HAVE_MMX |
1619 //FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway) | 1619 //FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway) |
1620 asm volatile( | 1620 asm volatile( |
1621 "xorl %%eax, %%eax \n\t" | 1621 "xor %%"REG_a", %%"REG_a" \n\t" |
1622 ".balign 16 \n\t" | 1622 ".balign 16 \n\t" |
1623 "1: \n\t" | 1623 "1: \n\t" |
1624 PREFETCH" 32(%1, %%eax, 2) \n\t" | 1624 PREFETCH" 32(%1, %%"REG_a", 2) \n\t" |
1625 PREFETCH" 32(%2, %%eax) \n\t" | 1625 PREFETCH" 32(%2, %%"REG_a") \n\t" |
1626 PREFETCH" 32(%3, %%eax) \n\t" | 1626 PREFETCH" 32(%3, %%"REG_a") \n\t" |
1627 "movq (%2, %%eax), %%mm0 \n\t" // U(0) | 1627 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) |
1628 "movq %%mm0, %%mm2 \n\t" // U(0) | 1628 "movq %%mm0, %%mm2 \n\t" // U(0) |
1629 "movq (%3, %%eax), %%mm1 \n\t" // V(0) | 1629 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) |
1630 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | 1630 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) |
1631 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) | 1631 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) |
1632 | 1632 |
1633 "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0) | 1633 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) |
1634 "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8) | 1634 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) |
1635 "movq %%mm0, %%mm4 \n\t" // Y(0) | 1635 "movq %%mm0, %%mm4 \n\t" // Y(0) |
1636 "movq %%mm2, %%mm6 \n\t" // Y(8) | 1636 "movq %%mm2, %%mm6 \n\t" // Y(8) |
1637 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0) | 1637 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0) |
1638 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4) | 1638 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4) |
1639 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8) | 1639 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8) |
1640 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12) | 1640 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12) |
1641 | 1641 |
1642 MOVNTQ" %%mm0, (%0, %%eax, 4) \n\t" | 1642 MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t" |
1643 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t" | 1643 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t" |
1644 MOVNTQ" %%mm2, 16(%0, %%eax, 4) \n\t" | 1644 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t" |
1645 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t" | 1645 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t" |
1646 | 1646 |
1647 "addl $8, %%eax \n\t" | 1647 "add $8, %%"REG_a" \n\t" |
1648 "cmpl %4, %%eax \n\t" | 1648 "cmp %4, %%"REG_a" \n\t" |
1649 " jb 1b \n\t" | 1649 " jb 1b \n\t" |
1650 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) | 1650 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" ((long)chromWidth) |
1651 : "%eax" | 1651 : "%"REG_a |
1652 ); | 1652 ); |
1653 #else | 1653 #else |
1654 //FIXME adapt the alpha asm code from yv12->yuy2 | 1654 //FIXME adapt the alpha asm code from yv12->yuy2 |
1655 | 1655 |
1656 #if __WORDSIZE >= 64 | 1656 #if __WORDSIZE >= 64 |
1738 const unsigned chromWidth= width>>1; | 1738 const unsigned chromWidth= width>>1; |
1739 for(y=0; y<height; y+=2) | 1739 for(y=0; y<height; y+=2) |
1740 { | 1740 { |
1741 #ifdef HAVE_MMX | 1741 #ifdef HAVE_MMX |
1742 asm volatile( | 1742 asm volatile( |
1743 "xorl %%eax, %%eax \n\t" | 1743 "xor %%"REG_a", %%"REG_a" \n\t" |
1744 "pcmpeqw %%mm7, %%mm7 \n\t" | 1744 "pcmpeqw %%mm7, %%mm7 \n\t" |
1745 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | 1745 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... |
1746 ".balign 16 \n\t" | 1746 ".balign 16 \n\t" |
1747 "1: \n\t" | 1747 "1: \n\t" |
1748 PREFETCH" 64(%0, %%eax, 4) \n\t" | 1748 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" |
1749 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0) | 1749 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) |
1750 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4) | 1750 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4) |
1751 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) | 1751 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) |
1752 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) | 1752 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) |
1753 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) | 1753 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) |
1754 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) | 1754 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) |
1755 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | 1755 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) |
1756 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | 1756 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) |
1757 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | 1757 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) |
1758 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | 1758 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) |
1759 | 1759 |
1760 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t" | 1760 MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t" |
1761 | 1761 |
1762 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8) | 1762 "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8) |
1763 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12) | 1763 "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12) |
1764 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) | 1764 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) |
1765 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) | 1765 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) |
1766 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) | 1766 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) |
1767 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) | 1767 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) |
1768 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | 1768 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) |
1769 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | 1769 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) |
1770 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | 1770 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) |
1771 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | 1771 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) |
1772 | 1772 |
1773 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t" | 1773 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t" |
1774 | 1774 |
1775 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | 1775 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) |
1776 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | 1776 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) |
1777 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | 1777 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) |
1778 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | 1778 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) |
1779 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | 1779 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) |
1780 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | 1780 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) |
1781 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | 1781 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) |
1782 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | 1782 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) |
1783 | 1783 |
1784 MOVNTQ" %%mm0, (%3, %%eax) \n\t" | 1784 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" |
1785 MOVNTQ" %%mm2, (%2, %%eax) \n\t" | 1785 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" |
1786 | 1786 |
1787 "addl $8, %%eax \n\t" | 1787 "add $8, %%"REG_a" \n\t" |
1788 "cmpl %4, %%eax \n\t" | 1788 "cmp %4, %%"REG_a" \n\t" |
1789 " jb 1b \n\t" | 1789 " jb 1b \n\t" |
1790 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | 1790 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" ((long)chromWidth) |
1791 : "memory", "%eax" | 1791 : "memory", "%"REG_a |
1792 ); | 1792 ); |
1793 | 1793 |
1794 ydst += lumStride; | 1794 ydst += lumStride; |
1795 src += srcStride; | 1795 src += srcStride; |
1796 | 1796 |
1797 asm volatile( | 1797 asm volatile( |
1798 "xorl %%eax, %%eax \n\t" | 1798 "xor %%"REG_a", %%"REG_a" \n\t" |
1799 ".balign 16 \n\t" | 1799 ".balign 16 \n\t" |
1800 "1: \n\t" | 1800 "1: \n\t" |
1801 PREFETCH" 64(%0, %%eax, 4) \n\t" | 1801 PREFETCH" 64(%0, %%"REG_a", 4) \n\t" |
1802 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0) | 1802 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) |
1803 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4) | 1803 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4) |
1804 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8) | 1804 "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8) |
1805 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12) | 1805 "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12) |
1806 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) | 1806 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) |
1807 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | 1807 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) |
1808 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | 1808 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) |
1809 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | 1809 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) |
1810 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | 1810 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) |
1811 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | 1811 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) |
1812 | 1812 |
1813 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t" | 1813 MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t" |
1814 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t" | 1814 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t" |
1815 | 1815 |
1816 "addl $8, %%eax \n\t" | 1816 "add $8, %%"REG_a" \n\t" |
1817 "cmpl %4, %%eax \n\t" | 1817 "cmp %4, %%"REG_a" \n\t" |
1818 " jb 1b \n\t" | 1818 " jb 1b \n\t" |
1819 | 1819 |
1820 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | 1820 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" ((long)chromWidth) |
1821 : "memory", "%eax" | 1821 : "memory", "%"REG_a |
1822 ); | 1822 ); |
1823 #else | 1823 #else |
1824 unsigned i; | 1824 unsigned i; |
1825 for(i=0; i<chromWidth; i++) | 1825 for(i=0; i<chromWidth; i++) |
1826 { | 1826 { |
1875 | 1875 |
1876 dst+= dstStride; | 1876 dst+= dstStride; |
1877 | 1877 |
1878 for(y=1; y<srcHeight; y++){ | 1878 for(y=1; y<srcHeight; y++){ |
1879 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1879 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
1880 const int mmxSize= srcWidth&~15; | 1880 const long mmxSize= srcWidth&~15; |
1881 asm volatile( | 1881 asm volatile( |
1882 "movl %4, %%eax \n\t" | 1882 "mov %4, %%"REG_a" \n\t" |
1883 "1: \n\t" | 1883 "1: \n\t" |
1884 "movq (%0, %%eax), %%mm0 \n\t" | 1884 "movq (%0, %%"REG_a"), %%mm0 \n\t" |
1885 "movq (%1, %%eax), %%mm1 \n\t" | 1885 "movq (%1, %%"REG_a"), %%mm1 \n\t" |
1886 "movq 1(%0, %%eax), %%mm2 \n\t" | 1886 "movq 1(%0, %%"REG_a"), %%mm2 \n\t" |
1887 "movq 1(%1, %%eax), %%mm3 \n\t" | 1887 "movq 1(%1, %%"REG_a"), %%mm3 \n\t" |
1888 "movq -1(%0, %%eax), %%mm4 \n\t" | 1888 "movq -1(%0, %%"REG_a"), %%mm4 \n\t" |
1889 "movq -1(%1, %%eax), %%mm5 \n\t" | 1889 "movq -1(%1, %%"REG_a"), %%mm5 \n\t" |
1890 PAVGB" %%mm0, %%mm5 \n\t" | 1890 PAVGB" %%mm0, %%mm5 \n\t" |
1891 PAVGB" %%mm0, %%mm3 \n\t" | 1891 PAVGB" %%mm0, %%mm3 \n\t" |
1892 PAVGB" %%mm0, %%mm5 \n\t" | 1892 PAVGB" %%mm0, %%mm5 \n\t" |
1893 PAVGB" %%mm0, %%mm3 \n\t" | 1893 PAVGB" %%mm0, %%mm3 \n\t" |
1894 PAVGB" %%mm1, %%mm4 \n\t" | 1894 PAVGB" %%mm1, %%mm4 \n\t" |
1900 "punpcklbw %%mm3, %%mm5 \n\t" | 1900 "punpcklbw %%mm3, %%mm5 \n\t" |
1901 "punpckhbw %%mm3, %%mm7 \n\t" | 1901 "punpckhbw %%mm3, %%mm7 \n\t" |
1902 "punpcklbw %%mm2, %%mm4 \n\t" | 1902 "punpcklbw %%mm2, %%mm4 \n\t" |
1903 "punpckhbw %%mm2, %%mm6 \n\t" | 1903 "punpckhbw %%mm2, %%mm6 \n\t" |
1904 #if 1 | 1904 #if 1 |
1905 MOVNTQ" %%mm5, (%2, %%eax, 2) \n\t" | 1905 MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t" |
1906 MOVNTQ" %%mm7, 8(%2, %%eax, 2) \n\t" | 1906 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t" |
1907 MOVNTQ" %%mm4, (%3, %%eax, 2) \n\t" | 1907 MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t" |
1908 MOVNTQ" %%mm6, 8(%3, %%eax, 2) \n\t" | 1908 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t" |
1909 #else | 1909 #else |
1910 "movq %%mm5, (%2, %%eax, 2) \n\t" | 1910 "movq %%mm5, (%2, %%"REG_a", 2) \n\t" |
1911 "movq %%mm7, 8(%2, %%eax, 2) \n\t" | 1911 "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t" |
1912 "movq %%mm4, (%3, %%eax, 2) \n\t" | 1912 "movq %%mm4, (%3, %%"REG_a", 2) \n\t" |
1913 "movq %%mm6, 8(%3, %%eax, 2) \n\t" | 1913 "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t" |
1914 #endif | 1914 #endif |
1915 "addl $8, %%eax \n\t" | 1915 "add $8, %%"REG_a" \n\t" |
1916 " js 1b \n\t" | 1916 " js 1b \n\t" |
1917 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), | 1917 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), |
1918 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), | 1918 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), |
1919 "g" (-mmxSize) | 1919 "g" (-mmxSize) |
1920 : "%eax" | 1920 : "%"REG_a |
1921 | 1921 |
1922 ); | 1922 ); |
1923 #else | 1923 #else |
1924 const int mmxSize=1; | 1924 const int mmxSize=1; |
1925 #endif | 1925 #endif |
2105 { | 2105 { |
2106 unsigned i; | 2106 unsigned i; |
2107 for(i=0; i<2; i++) | 2107 for(i=0; i<2; i++) |
2108 { | 2108 { |
2109 asm volatile( | 2109 asm volatile( |
2110 "movl %2, %%eax \n\t" | 2110 "mov %2, %%"REG_a" \n\t" |
2111 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t" | 2111 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t" |
2112 "movq "MANGLE(w1111)", %%mm5 \n\t" | 2112 "movq "MANGLE(w1111)", %%mm5 \n\t" |
2113 "pxor %%mm7, %%mm7 \n\t" | 2113 "pxor %%mm7, %%mm7 \n\t" |
2114 "leal (%%eax, %%eax, 2), %%ebx \n\t" | 2114 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" |
2115 ".balign 16 \n\t" | 2115 ".balign 16 \n\t" |
2116 "1: \n\t" | 2116 "1: \n\t" |
2117 PREFETCH" 64(%0, %%ebx) \n\t" | 2117 PREFETCH" 64(%0, %%"REG_b") \n\t" |
2118 "movd (%0, %%ebx), %%mm0 \n\t" | 2118 "movd (%0, %%"REG_b"), %%mm0 \n\t" |
2119 "movd 3(%0, %%ebx), %%mm1 \n\t" | 2119 "movd 3(%0, %%"REG_b"), %%mm1 \n\t" |
2120 "punpcklbw %%mm7, %%mm0 \n\t" | 2120 "punpcklbw %%mm7, %%mm0 \n\t" |
2121 "punpcklbw %%mm7, %%mm1 \n\t" | 2121 "punpcklbw %%mm7, %%mm1 \n\t" |
2122 "movd 6(%0, %%ebx), %%mm2 \n\t" | 2122 "movd 6(%0, %%"REG_b"), %%mm2 \n\t" |
2123 "movd 9(%0, %%ebx), %%mm3 \n\t" | 2123 "movd 9(%0, %%"REG_b"), %%mm3 \n\t" |
2124 "punpcklbw %%mm7, %%mm2 \n\t" | 2124 "punpcklbw %%mm7, %%mm2 \n\t" |
2125 "punpcklbw %%mm7, %%mm3 \n\t" | 2125 "punpcklbw %%mm7, %%mm3 \n\t" |
2126 "pmaddwd %%mm6, %%mm0 \n\t" | 2126 "pmaddwd %%mm6, %%mm0 \n\t" |
2127 "pmaddwd %%mm6, %%mm1 \n\t" | 2127 "pmaddwd %%mm6, %%mm1 \n\t" |
2128 "pmaddwd %%mm6, %%mm2 \n\t" | 2128 "pmaddwd %%mm6, %%mm2 \n\t" |
2138 "pmaddwd %%mm5, %%mm0 \n\t" | 2138 "pmaddwd %%mm5, %%mm0 \n\t" |
2139 "pmaddwd %%mm5, %%mm2 \n\t" | 2139 "pmaddwd %%mm5, %%mm2 \n\t" |
2140 "packssdw %%mm2, %%mm0 \n\t" | 2140 "packssdw %%mm2, %%mm0 \n\t" |
2141 "psraw $7, %%mm0 \n\t" | 2141 "psraw $7, %%mm0 \n\t" |
2142 | 2142 |
2143 "movd 12(%0, %%ebx), %%mm4 \n\t" | 2143 "movd 12(%0, %%"REG_b"), %%mm4 \n\t" |
2144 "movd 15(%0, %%ebx), %%mm1 \n\t" | 2144 "movd 15(%0, %%"REG_b"), %%mm1 \n\t" |
2145 "punpcklbw %%mm7, %%mm4 \n\t" | 2145 "punpcklbw %%mm7, %%mm4 \n\t" |
2146 "punpcklbw %%mm7, %%mm1 \n\t" | 2146 "punpcklbw %%mm7, %%mm1 \n\t" |
2147 "movd 18(%0, %%ebx), %%mm2 \n\t" | 2147 "movd 18(%0, %%"REG_b"), %%mm2 \n\t" |
2148 "movd 21(%0, %%ebx), %%mm3 \n\t" | 2148 "movd 21(%0, %%"REG_b"), %%mm3 \n\t" |
2149 "punpcklbw %%mm7, %%mm2 \n\t" | 2149 "punpcklbw %%mm7, %%mm2 \n\t" |
2150 "punpcklbw %%mm7, %%mm3 \n\t" | 2150 "punpcklbw %%mm7, %%mm3 \n\t" |
2151 "pmaddwd %%mm6, %%mm4 \n\t" | 2151 "pmaddwd %%mm6, %%mm4 \n\t" |
2152 "pmaddwd %%mm6, %%mm1 \n\t" | 2152 "pmaddwd %%mm6, %%mm1 \n\t" |
2153 "pmaddwd %%mm6, %%mm2 \n\t" | 2153 "pmaddwd %%mm6, %%mm2 \n\t" |
2160 #endif | 2160 #endif |
2161 "packssdw %%mm1, %%mm4 \n\t" | 2161 "packssdw %%mm1, %%mm4 \n\t" |
2162 "packssdw %%mm3, %%mm2 \n\t" | 2162 "packssdw %%mm3, %%mm2 \n\t" |
2163 "pmaddwd %%mm5, %%mm4 \n\t" | 2163 "pmaddwd %%mm5, %%mm4 \n\t" |
2164 "pmaddwd %%mm5, %%mm2 \n\t" | 2164 "pmaddwd %%mm5, %%mm2 \n\t" |
2165 "addl $24, %%ebx \n\t" | 2165 "add $24, %%"REG_b" \n\t" |
2166 "packssdw %%mm2, %%mm4 \n\t" | 2166 "packssdw %%mm2, %%mm4 \n\t" |
2167 "psraw $7, %%mm4 \n\t" | 2167 "psraw $7, %%mm4 \n\t" |
2168 | 2168 |
2169 "packuswb %%mm4, %%mm0 \n\t" | 2169 "packuswb %%mm4, %%mm0 \n\t" |
2170 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" | 2170 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" |
2171 | 2171 |
2172 MOVNTQ" %%mm0, (%1, %%eax) \n\t" | 2172 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t" |
2173 "addl $8, %%eax \n\t" | 2173 "add $8, %%"REG_a" \n\t" |
2174 " js 1b \n\t" | 2174 " js 1b \n\t" |
2175 : : "r" (src+width*3), "r" (ydst+width), "g" (-width) | 2175 : : "r" (src+width*3), "r" (ydst+width), "g" ((long)-width) |
2176 : "%eax", "%ebx" | 2176 : "%"REG_a, "%"REG_b |
2177 ); | 2177 ); |
2178 ydst += lumStride; | 2178 ydst += lumStride; |
2179 src += srcStride; | 2179 src += srcStride; |
2180 } | 2180 } |
2181 src -= srcStride*2; | 2181 src -= srcStride*2; |
2182 asm volatile( | 2182 asm volatile( |
2183 "movl %4, %%eax \n\t" | 2183 "mov %4, %%"REG_a" \n\t" |
2184 "movq "MANGLE(w1111)", %%mm5 \n\t" | 2184 "movq "MANGLE(w1111)", %%mm5 \n\t" |
2185 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" | 2185 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" |
2186 "pxor %%mm7, %%mm7 \n\t" | 2186 "pxor %%mm7, %%mm7 \n\t" |
2187 "leal (%%eax, %%eax, 2), %%ebx \n\t" | 2187 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" |
2188 "addl %%ebx, %%ebx \n\t" | 2188 "add %%"REG_b", %%"REG_b" \n\t" |
2189 ".balign 16 \n\t" | 2189 ".balign 16 \n\t" |
2190 "1: \n\t" | 2190 "1: \n\t" |
2191 PREFETCH" 64(%0, %%ebx) \n\t" | 2191 PREFETCH" 64(%0, %%"REG_b") \n\t" |
2192 PREFETCH" 64(%1, %%ebx) \n\t" | 2192 PREFETCH" 64(%1, %%"REG_b") \n\t" |
2193 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 2193 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2194 "movq (%0, %%ebx), %%mm0 \n\t" | 2194 "movq (%0, %%"REG_b"), %%mm0 \n\t" |
2195 "movq (%1, %%ebx), %%mm1 \n\t" | 2195 "movq (%1, %%"REG_b"), %%mm1 \n\t" |
2196 "movq 6(%0, %%ebx), %%mm2 \n\t" | 2196 "movq 6(%0, %%"REG_b"), %%mm2 \n\t" |
2197 "movq 6(%1, %%ebx), %%mm3 \n\t" | 2197 "movq 6(%1, %%"REG_b"), %%mm3 \n\t" |
2198 PAVGB" %%mm1, %%mm0 \n\t" | 2198 PAVGB" %%mm1, %%mm0 \n\t" |
2199 PAVGB" %%mm3, %%mm2 \n\t" | 2199 PAVGB" %%mm3, %%mm2 \n\t" |
2200 "movq %%mm0, %%mm1 \n\t" | 2200 "movq %%mm0, %%mm1 \n\t" |
2201 "movq %%mm2, %%mm3 \n\t" | 2201 "movq %%mm2, %%mm3 \n\t" |
2202 "psrlq $24, %%mm0 \n\t" | 2202 "psrlq $24, %%mm0 \n\t" |
2204 PAVGB" %%mm1, %%mm0 \n\t" | 2204 PAVGB" %%mm1, %%mm0 \n\t" |
2205 PAVGB" %%mm3, %%mm2 \n\t" | 2205 PAVGB" %%mm3, %%mm2 \n\t" |
2206 "punpcklbw %%mm7, %%mm0 \n\t" | 2206 "punpcklbw %%mm7, %%mm0 \n\t" |
2207 "punpcklbw %%mm7, %%mm2 \n\t" | 2207 "punpcklbw %%mm7, %%mm2 \n\t" |
2208 #else | 2208 #else |
2209 "movd (%0, %%ebx), %%mm0 \n\t" | 2209 "movd (%0, %%"REG_b"), %%mm0 \n\t" |
2210 "movd (%1, %%ebx), %%mm1 \n\t" | 2210 "movd (%1, %%"REG_b"), %%mm1 \n\t" |
2211 "movd 3(%0, %%ebx), %%mm2 \n\t" | 2211 "movd 3(%0, %%"REG_b"), %%mm2 \n\t" |
2212 "movd 3(%1, %%ebx), %%mm3 \n\t" | 2212 "movd 3(%1, %%"REG_b"), %%mm3 \n\t" |
2213 "punpcklbw %%mm7, %%mm0 \n\t" | 2213 "punpcklbw %%mm7, %%mm0 \n\t" |
2214 "punpcklbw %%mm7, %%mm1 \n\t" | 2214 "punpcklbw %%mm7, %%mm1 \n\t" |
2215 "punpcklbw %%mm7, %%mm2 \n\t" | 2215 "punpcklbw %%mm7, %%mm2 \n\t" |
2216 "punpcklbw %%mm7, %%mm3 \n\t" | 2216 "punpcklbw %%mm7, %%mm3 \n\t" |
2217 "paddw %%mm1, %%mm0 \n\t" | 2217 "paddw %%mm1, %%mm0 \n\t" |
2218 "paddw %%mm3, %%mm2 \n\t" | 2218 "paddw %%mm3, %%mm2 \n\t" |
2219 "paddw %%mm2, %%mm0 \n\t" | 2219 "paddw %%mm2, %%mm0 \n\t" |
2220 "movd 6(%0, %%ebx), %%mm4 \n\t" | 2220 "movd 6(%0, %%"REG_b"), %%mm4 \n\t" |
2221 "movd 6(%1, %%ebx), %%mm1 \n\t" | 2221 "movd 6(%1, %%"REG_b"), %%mm1 \n\t" |
2222 "movd 9(%0, %%ebx), %%mm2 \n\t" | 2222 "movd 9(%0, %%"REG_b"), %%mm2 \n\t" |
2223 "movd 9(%1, %%ebx), %%mm3 \n\t" | 2223 "movd 9(%1, %%"REG_b"), %%mm3 \n\t" |
2224 "punpcklbw %%mm7, %%mm4 \n\t" | 2224 "punpcklbw %%mm7, %%mm4 \n\t" |
2225 "punpcklbw %%mm7, %%mm1 \n\t" | 2225 "punpcklbw %%mm7, %%mm1 \n\t" |
2226 "punpcklbw %%mm7, %%mm2 \n\t" | 2226 "punpcklbw %%mm7, %%mm2 \n\t" |
2227 "punpcklbw %%mm7, %%mm3 \n\t" | 2227 "punpcklbw %%mm7, %%mm3 \n\t" |
2228 "paddw %%mm1, %%mm4 \n\t" | 2228 "paddw %%mm1, %%mm4 \n\t" |
2250 "pmaddwd %%mm5, %%mm1 \n\t" | 2250 "pmaddwd %%mm5, %%mm1 \n\t" |
2251 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | 2251 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 |
2252 "psraw $7, %%mm0 \n\t" | 2252 "psraw $7, %%mm0 \n\t" |
2253 | 2253 |
2254 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 2254 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2255 "movq 12(%0, %%ebx), %%mm4 \n\t" | 2255 "movq 12(%0, %%"REG_b"), %%mm4 \n\t" |
2256 "movq 12(%1, %%ebx), %%mm1 \n\t" | 2256 "movq 12(%1, %%"REG_b"), %%mm1 \n\t" |
2257 "movq 18(%0, %%ebx), %%mm2 \n\t" | 2257 "movq 18(%0, %%"REG_b"), %%mm2 \n\t" |
2258 "movq 18(%1, %%ebx), %%mm3 \n\t" | 2258 "movq 18(%1, %%"REG_b"), %%mm3 \n\t" |
2259 PAVGB" %%mm1, %%mm4 \n\t" | 2259 PAVGB" %%mm1, %%mm4 \n\t" |
2260 PAVGB" %%mm3, %%mm2 \n\t" | 2260 PAVGB" %%mm3, %%mm2 \n\t" |
2261 "movq %%mm4, %%mm1 \n\t" | 2261 "movq %%mm4, %%mm1 \n\t" |
2262 "movq %%mm2, %%mm3 \n\t" | 2262 "movq %%mm2, %%mm3 \n\t" |
2263 "psrlq $24, %%mm4 \n\t" | 2263 "psrlq $24, %%mm4 \n\t" |
2265 PAVGB" %%mm1, %%mm4 \n\t" | 2265 PAVGB" %%mm1, %%mm4 \n\t" |
2266 PAVGB" %%mm3, %%mm2 \n\t" | 2266 PAVGB" %%mm3, %%mm2 \n\t" |
2267 "punpcklbw %%mm7, %%mm4 \n\t" | 2267 "punpcklbw %%mm7, %%mm4 \n\t" |
2268 "punpcklbw %%mm7, %%mm2 \n\t" | 2268 "punpcklbw %%mm7, %%mm2 \n\t" |
2269 #else | 2269 #else |
2270 "movd 12(%0, %%ebx), %%mm4 \n\t" | 2270 "movd 12(%0, %%"REG_b"), %%mm4 \n\t" |
2271 "movd 12(%1, %%ebx), %%mm1 \n\t" | 2271 "movd 12(%1, %%"REG_b"), %%mm1 \n\t" |
2272 "movd 15(%0, %%ebx), %%mm2 \n\t" | 2272 "movd 15(%0, %%"REG_b"), %%mm2 \n\t" |
2273 "movd 15(%1, %%ebx), %%mm3 \n\t" | 2273 "movd 15(%1, %%"REG_b"), %%mm3 \n\t" |
2274 "punpcklbw %%mm7, %%mm4 \n\t" | 2274 "punpcklbw %%mm7, %%mm4 \n\t" |
2275 "punpcklbw %%mm7, %%mm1 \n\t" | 2275 "punpcklbw %%mm7, %%mm1 \n\t" |
2276 "punpcklbw %%mm7, %%mm2 \n\t" | 2276 "punpcklbw %%mm7, %%mm2 \n\t" |
2277 "punpcklbw %%mm7, %%mm3 \n\t" | 2277 "punpcklbw %%mm7, %%mm3 \n\t" |
2278 "paddw %%mm1, %%mm4 \n\t" | 2278 "paddw %%mm1, %%mm4 \n\t" |
2279 "paddw %%mm3, %%mm2 \n\t" | 2279 "paddw %%mm3, %%mm2 \n\t" |
2280 "paddw %%mm2, %%mm4 \n\t" | 2280 "paddw %%mm2, %%mm4 \n\t" |
2281 "movd 18(%0, %%ebx), %%mm5 \n\t" | 2281 "movd 18(%0, %%"REG_b"), %%mm5 \n\t" |
2282 "movd 18(%1, %%ebx), %%mm1 \n\t" | 2282 "movd 18(%1, %%"REG_b"), %%mm1 \n\t" |
2283 "movd 21(%0, %%ebx), %%mm2 \n\t" | 2283 "movd 21(%0, %%"REG_b"), %%mm2 \n\t" |
2284 "movd 21(%1, %%ebx), %%mm3 \n\t" | 2284 "movd 21(%1, %%"REG_b"), %%mm3 \n\t" |
2285 "punpcklbw %%mm7, %%mm5 \n\t" | 2285 "punpcklbw %%mm7, %%mm5 \n\t" |
2286 "punpcklbw %%mm7, %%mm1 \n\t" | 2286 "punpcklbw %%mm7, %%mm1 \n\t" |
2287 "punpcklbw %%mm7, %%mm2 \n\t" | 2287 "punpcklbw %%mm7, %%mm2 \n\t" |
2288 "punpcklbw %%mm7, %%mm3 \n\t" | 2288 "punpcklbw %%mm7, %%mm3 \n\t" |
2289 "paddw %%mm1, %%mm5 \n\t" | 2289 "paddw %%mm1, %%mm5 \n\t" |
2308 #endif | 2308 #endif |
2309 "packssdw %%mm2, %%mm4 \n\t" | 2309 "packssdw %%mm2, %%mm4 \n\t" |
2310 "packssdw %%mm3, %%mm1 \n\t" | 2310 "packssdw %%mm3, %%mm1 \n\t" |
2311 "pmaddwd %%mm5, %%mm4 \n\t" | 2311 "pmaddwd %%mm5, %%mm4 \n\t" |
2312 "pmaddwd %%mm5, %%mm1 \n\t" | 2312 "pmaddwd %%mm5, %%mm1 \n\t" |
2313 "addl $24, %%ebx \n\t" | 2313 "add $24, %%"REG_b" \n\t" |
2314 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 | 2314 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 |
2315 "psraw $7, %%mm4 \n\t" | 2315 "psraw $7, %%mm4 \n\t" |
2316 | 2316 |
2317 "movq %%mm0, %%mm1 \n\t" | 2317 "movq %%mm0, %%mm1 \n\t" |
2318 "punpckldq %%mm4, %%mm0 \n\t" | 2318 "punpckldq %%mm4, %%mm0 \n\t" |
2319 "punpckhdq %%mm4, %%mm1 \n\t" | 2319 "punpckhdq %%mm4, %%mm1 \n\t" |
2320 "packsswb %%mm1, %%mm0 \n\t" | 2320 "packsswb %%mm1, %%mm0 \n\t" |
2321 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" | 2321 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" |
2322 | 2322 "movd %%mm0, (%2, %%"REG_a") \n\t" |
2323 "movd %%mm0, (%2, %%eax) \n\t" | |
2324 "punpckhdq %%mm0, %%mm0 \n\t" | 2323 "punpckhdq %%mm0, %%mm0 \n\t" |
2325 "movd %%mm0, (%3, %%eax) \n\t" | 2324 "movd %%mm0, (%3, %%"REG_a") \n\t" |
2326 "addl $4, %%eax \n\t" | 2325 "add $4, %%"REG_a" \n\t" |
2327 " js 1b \n\t" | 2326 " js 1b \n\t" |
2328 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth) | 2327 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" ((long)-chromWidth) |
2329 : "%eax", "%ebx" | 2328 : "%"REG_a, "%"REG_b |
2330 ); | 2329 ); |
2331 | 2330 |
2332 udst += chromStride; | 2331 udst += chromStride; |
2333 vdst += chromStride; | 2332 vdst += chromStride; |
2334 src += srcStride*2; | 2333 src += srcStride*2; |
2401 unsigned w; | 2400 unsigned w; |
2402 | 2401 |
2403 #ifdef HAVE_MMX | 2402 #ifdef HAVE_MMX |
2404 #ifdef HAVE_SSE2 | 2403 #ifdef HAVE_SSE2 |
2405 asm( | 2404 asm( |
2406 "xorl %%eax, %%eax \n\t" | 2405 "xor %%"REG_a", %%"REG_a" \n\t" |
2407 "1: \n\t" | 2406 "1: \n\t" |
2408 PREFETCH" 64(%1, %%eax) \n\t" | 2407 PREFETCH" 64(%1, %%"REG_a") \n\t" |
2409 PREFETCH" 64(%2, %%eax) \n\t" | 2408 PREFETCH" 64(%2, %%"REG_a") \n\t" |
2410 "movdqa (%1, %%eax), %%xmm0 \n\t" | 2409 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t" |
2411 "movdqa (%1, %%eax), %%xmm1 \n\t" | 2410 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t" |
2412 "movdqa (%2, %%eax), %%xmm2 \n\t" | 2411 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t" |
2413 "punpcklbw %%xmm2, %%xmm0 \n\t" | 2412 "punpcklbw %%xmm2, %%xmm0 \n\t" |
2414 "punpckhbw %%xmm2, %%xmm1 \n\t" | 2413 "punpckhbw %%xmm2, %%xmm1 \n\t" |
2415 "movntdq %%xmm0, (%0, %%eax, 2) \n\t" | 2414 "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t" |
2416 "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t" | 2415 "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t" |
2417 "addl $16, %%eax \n\t" | 2416 "add $16, %%"REG_a" \n\t" |
2418 "cmpl %3, %%eax \n\t" | 2417 "cmp %3, %%"REG_a" \n\t" |
2419 " jb 1b \n\t" | 2418 " jb 1b \n\t" |
2420 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) | 2419 ::"r"(dest), "r"(src1), "r"(src2), "r" ((long)width-15) |
2421 : "memory", "%eax" | 2420 : "memory", "%"REG_a"" |
2422 ); | 2421 ); |
2423 #else | 2422 #else |
2424 asm( | 2423 asm( |
2425 "xorl %%eax, %%eax \n\t" | 2424 "xor %%"REG_a", %%"REG_a" \n\t" |
2426 "1: \n\t" | 2425 "1: \n\t" |
2427 PREFETCH" 64(%1, %%eax) \n\t" | 2426 PREFETCH" 64(%1, %%"REG_a") \n\t" |
2428 PREFETCH" 64(%2, %%eax) \n\t" | 2427 PREFETCH" 64(%2, %%"REG_a") \n\t" |
2429 "movq (%1, %%eax), %%mm0 \n\t" | 2428 "movq (%1, %%"REG_a"), %%mm0 \n\t" |
2430 "movq 8(%1, %%eax), %%mm2 \n\t" | 2429 "movq 8(%1, %%"REG_a"), %%mm2 \n\t" |
2431 "movq %%mm0, %%mm1 \n\t" | 2430 "movq %%mm0, %%mm1 \n\t" |
2432 "movq %%mm2, %%mm3 \n\t" | 2431 "movq %%mm2, %%mm3 \n\t" |
2433 "movq (%2, %%eax), %%mm4 \n\t" | 2432 "movq (%2, %%"REG_a"), %%mm4 \n\t" |
2434 "movq 8(%2, %%eax), %%mm5 \n\t" | 2433 "movq 8(%2, %%"REG_a"), %%mm5 \n\t" |
2435 "punpcklbw %%mm4, %%mm0 \n\t" | 2434 "punpcklbw %%mm4, %%mm0 \n\t" |
2436 "punpckhbw %%mm4, %%mm1 \n\t" | 2435 "punpckhbw %%mm4, %%mm1 \n\t" |
2437 "punpcklbw %%mm5, %%mm2 \n\t" | 2436 "punpcklbw %%mm5, %%mm2 \n\t" |
2438 "punpckhbw %%mm5, %%mm3 \n\t" | 2437 "punpckhbw %%mm5, %%mm3 \n\t" |
2439 MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t" | 2438 MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t" |
2440 MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t" | 2439 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t" |
2441 MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t" | 2440 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t" |
2442 MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t" | 2441 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t" |
2443 "addl $16, %%eax \n\t" | 2442 "add $16, %%"REG_a" \n\t" |
2444 "cmpl %3, %%eax \n\t" | 2443 "cmp %3, %%"REG_a" \n\t" |
2445 " jb 1b \n\t" | 2444 " jb 1b \n\t" |
2446 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) | 2445 ::"r"(dest), "r"(src1), "r"(src2), "r" ((long)width-15) |
2447 : "memory", "%eax" | 2446 : "memory", "%"REG_a |
2448 ); | 2447 ); |
2449 #endif | 2448 #endif |
2450 for(w= (width&(~15)); w < width; w++) | 2449 for(w= (width&(~15)); w < width; w++) |
2451 { | 2450 { |
2452 dest[2*w+0] = src1[w]; | 2451 dest[2*w+0] = src1[w]; |
2580 uint8_t *dst, | 2579 uint8_t *dst, |
2581 unsigned width, unsigned height, | 2580 unsigned width, unsigned height, |
2582 int srcStride1, int srcStride2, | 2581 int srcStride1, int srcStride2, |
2583 int srcStride3, int dstStride) | 2582 int srcStride3, int dstStride) |
2584 { | 2583 { |
2585 unsigned y,x,w,h; | 2584 unsigned long y,x,w,h; |
2586 w=width/2; h=height; | 2585 w=width/2; h=height; |
2587 for(y=0;y<h;y++){ | 2586 for(y=0;y<h;y++){ |
2588 const uint8_t* yp=src1+srcStride1*y; | 2587 const uint8_t* yp=src1+srcStride1*y; |
2589 const uint8_t* up=src2+srcStride2*(y>>2); | 2588 const uint8_t* up=src2+srcStride2*(y>>2); |
2590 const uint8_t* vp=src3+srcStride3*(y>>2); | 2589 const uint8_t* vp=src3+srcStride3*(y>>2); |