comparison i386/dsputil_mmx.c @ 1092:f59c3f66363b libavcodec

MpegEncContext.(i)dct_* -> DspContext.(i)dct_* bitexact cleanup
author michaelni
date Mon, 03 Mar 2003 14:54:00 +0000
parents 92fb44eae6b6
children 74a46d77e061
comparison
equal deleted inserted replaced
1091:03df246fb06b 1092:f59c3f66363b
18 * 18 *
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> 19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20 */ 20 */
21 21
22 #include "../dsputil.h" 22 #include "../dsputil.h"
23 #include "../simple_idct.h"
23 24
24 int mm_flags; /* multimedia extension flags */ 25 int mm_flags; /* multimedia extension flags */
25 26
26 /* pixel operations */ 27 /* pixel operations */
27 static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; 28 static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
1406 1407
1407 #define SET_QPEL_FUNC(postfix1, postfix2) \ 1408 #define SET_QPEL_FUNC(postfix1, postfix2) \
1408 c->put_ ## postfix1 = put_ ## postfix2;\ 1409 c->put_ ## postfix1 = put_ ## postfix2;\
1409 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\ 1410 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
1410 c->avg_ ## postfix1 = avg_ ## postfix2; 1411 c->avg_ ## postfix1 = avg_ ## postfix2;
1412
1413 /* external functions, from idct_mmx.c */
1414 void ff_mmx_idct(DCTELEM *block);
1415 void ff_mmxext_idct(DCTELEM *block);
1416
1417 /* XXX: those functions should be suppressed ASAP when all IDCTs are
1418 converted */
1419 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1420 {
1421 ff_mmx_idct (block);
1422 put_pixels_clamped_mmx(block, dest, line_size);
1423 }
1424 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1425 {
1426 ff_mmx_idct (block);
1427 add_pixels_clamped_mmx(block, dest, line_size);
1428 }
1429 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1430 {
1431 ff_mmxext_idct (block);
1432 put_pixels_clamped_mmx(block, dest, line_size);
1433 }
1434 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1435 {
1436 ff_mmxext_idct (block);
1437 add_pixels_clamped_mmx(block, dest, line_size);
1438 }
1411 1439
1412 void dsputil_init_mmx(DSPContext* c, unsigned mask) 1440 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
1413 { 1441 {
1414 mm_flags = mm_support(); 1442 mm_flags = mm_support();
1415 #if 0 1443 #if 0
1416 fprintf(stderr, "libavcodec: CPU flags:"); 1444 fprintf(stderr, "libavcodec: CPU flags:");
1417 if (mm_flags & MM_MMX) 1445 if (mm_flags & MM_MMX)
1426 fprintf(stderr, " sse2"); 1454 fprintf(stderr, " sse2");
1427 fprintf(stderr, "\n"); 1455 fprintf(stderr, "\n");
1428 #endif 1456 #endif
1429 1457
1430 if (mm_flags & MM_MMX) { 1458 if (mm_flags & MM_MMX) {
1459 const int dct_algo = avctx->dct_algo;
1460 const int idct_algo= avctx->idct_algo;
1461
1462 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX)
1463 c->fdct = ff_fdct_mmx;
1464
1465 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
1466 c->idct_put= ff_simple_idct_put_mmx;
1467 c->idct_add= ff_simple_idct_add_mmx;
1468 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
1469 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
1470 if(mm_flags & MM_MMXEXT){
1471 c->idct_put= ff_libmpeg2mmx2_idct_put;
1472 c->idct_add= ff_libmpeg2mmx2_idct_add;
1473 }else{
1474 c->idct_put= ff_libmpeg2mmx_idct_put;
1475 c->idct_add= ff_libmpeg2mmx_idct_add;
1476 }
1477 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
1478 }
1479
1431 c->get_pixels = get_pixels_mmx; 1480 c->get_pixels = get_pixels_mmx;
1432 c->diff_pixels = diff_pixels_mmx; 1481 c->diff_pixels = diff_pixels_mmx;
1433 c->put_pixels_clamped = put_pixels_clamped_mmx; 1482 c->put_pixels_clamped = put_pixels_clamped_mmx;
1434 c->add_pixels_clamped = add_pixels_clamped_mmx; 1483 c->add_pixels_clamped = add_pixels_clamped_mmx;
1435 c->clear_blocks = clear_blocks_mmx; 1484 c->clear_blocks = clear_blocks_mmx;
1485 c->sse[0] = sse16_mmx; 1534 c->sse[0] = sse16_mmx;
1486 1535
1487 if (mm_flags & MM_MMXEXT) { 1536 if (mm_flags & MM_MMXEXT) {
1488 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2; 1537 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
1489 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2; 1538 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
1490 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
1491 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
1492 1539
1493 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2; 1540 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
1494 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2; 1541 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
1495 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2; 1542 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
1496 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
1497 1543
1498 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2; 1544 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
1499 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2; 1545 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
1500 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
1501 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
1502 1546
1503 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2; 1547 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
1504 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; 1548 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
1505 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; 1549 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
1506 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; 1550
1551 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1552 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
1553 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
1554 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
1555 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
1556 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
1557 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
1558 }
1507 1559
1508 #if 1 1560 #if 1
1509 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2) 1561 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
1510 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2) 1562 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
1511 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2) 1563 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
1540 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2) 1592 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
1541 #endif 1593 #endif
1542 } else if (mm_flags & MM_3DNOW) { 1594 } else if (mm_flags & MM_3DNOW) {
1543 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; 1595 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
1544 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; 1596 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
1545 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
1546 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
1547 1597
1548 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow; 1598 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
1549 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; 1599 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
1550 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; 1600 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
1551 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
1552 1601
1553 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow; 1602 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
1554 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow; 1603 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
1555 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
1556 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
1557 1604
1558 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow; 1605 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
1559 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow; 1606 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
1560 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow; 1607 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
1561 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; 1608
1609 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1610 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
1611 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
1612 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
1613 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
1614 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
1615 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
1616 }
1562 1617
1563 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow) 1618 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
1564 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow) 1619 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
1565 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow) 1620 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
1566 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow) 1621 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
1592 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow) 1647 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
1593 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow) 1648 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
1594 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow) 1649 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
1595 } 1650 }
1596 } 1651 }
1597 dsputil_init_pix_mmx(c, mask); 1652
1653 dsputil_init_pix_mmx(c, avctx);
1598 #if 0 1654 #if 0
1599 // for speed testing 1655 // for speed testing
1600 get_pixels = just_return; 1656 get_pixels = just_return;
1601 put_pixels_clamped = just_return; 1657 put_pixels_clamped = just_return;
1602 add_pixels_clamped = just_return; 1658 add_pixels_clamped = just_return;
1628 1684
1629 //av_fdct = just_return; 1685 //av_fdct = just_return;
1630 //ff_idct = just_return; 1686 //ff_idct = just_return;
1631 #endif 1687 #endif
1632 } 1688 }
1633
1634 /* remove any non bit exact operation (testing purpose). NOTE that
1635 this function should be kept as small as possible because it is
1636 always difficult to test automatically non bit exact cases. */
1637 void dsputil_set_bit_exact_mmx(DSPContext* c, unsigned mask)
1638 {
1639 if (mm_flags & MM_MMX) {
1640 /* MMX2 & 3DNOW */
1641 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
1642 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
1643 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
1644 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
1645 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
1646 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
1647 }
1648 dsputil_set_bit_exact_pix_mmx(c, mask);
1649 }