comparison vorbis.c @ 3568:945caa35ee9a libavcodec

SSE and 3DNow! implementations of float->int conversion and MDCT windowing. 15% faster Vorbis decoding.
author lorenm
date Thu, 10 Aug 2006 19:06:25 +0000
parents 5ea82888103e
children 991ef6ade276
comparison
equal deleted inserted replaced
3567:1f8730f62765 3568:945caa35ee9a
190 av_free(vc->mappings[i].magnitude); 190 av_free(vc->mappings[i].magnitude);
191 av_free(vc->mappings[i].angle); 191 av_free(vc->mappings[i].angle);
192 av_free(vc->mappings[i].mux); 192 av_free(vc->mappings[i].mux);
193 } 193 }
194 av_freep(&vc->mappings); 194 av_freep(&vc->mappings);
195
196 if(vc->exp_bias){
197 av_freep(&vc->swin);
198 av_freep(&vc->lwin);
199 }
195 } 200 }
196 201
197 // Parse setup header ------------------------------------------------- 202 // Parse setup header -------------------------------------------------
198 203
199 // Process codebooks part 204 // Process codebooks part
886 return 4; 891 return 4;
887 } 892 }
888 vc->swin=vwin[bl0-6]; 893 vc->swin=vwin[bl0-6];
889 vc->lwin=vwin[bl1-6]; 894 vc->lwin=vwin[bl1-6];
890 895
896 if(vc->exp_bias){
897 int i;
898 float *win;
899 win = av_malloc(vc->blocksize_0/2 * sizeof(float));
900 for(i=0; i<vc->blocksize_0/2; i++)
901 win[i] = vc->swin[i] * (1<<15);
902 vc->swin = win;
903 win = av_malloc(vc->blocksize_1/2 * sizeof(float));
904 for(i=0; i<vc->blocksize_1/2; i++)
905 win[i] = vc->lwin[i] * (1<<15);
906 vc->lwin = win;
907 }
908
891 if ((get_bits1(gb)) == 0) { 909 if ((get_bits1(gb)) == 0) {
892 av_log(vc->avccontext, AV_LOG_ERROR, " Vorbis id header packet corrupt (framing flag not set). \n"); 910 av_log(vc->avccontext, AV_LOG_ERROR, " Vorbis id header packet corrupt (framing flag not set). \n");
893 return 2; 911 return 2;
894 } 912 }
895 913
928 GetBitContext *gb = &(vc->gb); 946 GetBitContext *gb = &(vc->gb);
929 int i, j, hdr_type; 947 int i, j, hdr_type;
930 948
931 vc->avccontext = avccontext; 949 vc->avccontext = avccontext;
932 dsputil_init(&vc->dsp, avccontext); 950 dsputil_init(&vc->dsp, avccontext);
951
952 if(vc->dsp.float_to_int16 == ff_float_to_int16_c) {
953 vc->add_bias = 385;
954 vc->exp_bias = 0;
955 } else {
956 vc->add_bias = 0;
957 vc->exp_bias = 15<<23;
958 }
933 959
934 if (!headers_len) { 960 if (!headers_len) {
935 av_log(avccontext, AV_LOG_ERROR, "Extradata corrupt.\n"); 961 av_log(avccontext, AV_LOG_ERROR, "Extradata corrupt.\n");
936 return -1; 962 return -1;
937 } 963 }
1470 } 1496 }
1471 } 1497 }
1472 } 1498 }
1473 1499
1474 // Decode the audio packet using the functions above 1500 // Decode the audio packet using the functions above
1475 #define BIAS 385
1476 1501
1477 static int vorbis_parse_audio_packet(vorbis_context *vc) { 1502 static int vorbis_parse_audio_packet(vorbis_context *vc) {
1478 GetBitContext *gb=&vc->gb; 1503 GetBitContext *gb=&vc->gb;
1479 1504
1480 uint_fast8_t previous_window=0,next_window=0; 1505 uint_fast8_t previous_window=0,next_window=0;
1488 float *ch_floor_ptr=vc->channel_floors; 1513 float *ch_floor_ptr=vc->channel_floors;
1489 uint_fast8_t res_chan[vc->audio_channels]; 1514 uint_fast8_t res_chan[vc->audio_channels];
1490 uint_fast8_t res_num=0; 1515 uint_fast8_t res_num=0;
1491 int_fast16_t retlen=0; 1516 int_fast16_t retlen=0;
1492 uint_fast16_t saved_start=0; 1517 uint_fast16_t saved_start=0;
1518 float fadd_bias = vc->add_bias;
1493 1519
1494 if (get_bits1(gb)) { 1520 if (get_bits1(gb)) {
1495 av_log(vc->avccontext, AV_LOG_ERROR, "Not a Vorbis I audio packet.\n"); 1521 av_log(vc->avccontext, AV_LOG_ERROR, "Not a Vorbis I audio packet.\n");
1496 return -1; // packet type not audio 1522 return -1; // packet type not audio
1497 } 1523 }
1574 1600
1575 // Dotproduct 1601 // Dotproduct
1576 1602
1577 for(j=0, ch_floor_ptr=vc->channel_floors;j<vc->audio_channels;++j,ch_floor_ptr+=blocksize/2) { 1603 for(j=0, ch_floor_ptr=vc->channel_floors;j<vc->audio_channels;++j,ch_floor_ptr+=blocksize/2) {
1578 ch_res_ptr=vc->channel_residues+res_chan[j]*blocksize/2; 1604 ch_res_ptr=vc->channel_residues+res_chan[j]*blocksize/2;
1579 1605 vc->dsp.vector_fmul(ch_floor_ptr, ch_res_ptr, blocksize/2);
1580 for(i=0;i<blocksize/2;++i) {
1581 ch_floor_ptr[i]*=ch_res_ptr[i]; //FPMATH
1582 }
1583 } 1606 }
1584 1607
1585 // MDCT, overlap/add, save data for next overlapping FPMATH 1608 // MDCT, overlap/add, save data for next overlapping FPMATH
1586 1609
1587 for(j=0;j<vc->audio_channels;++j) { 1610 for(j=0;j<vc->audio_channels;++j) {
1598 1621
1599 saved_start=vc->saved_start; 1622 saved_start=vc->saved_start;
1600 1623
1601 vc->mdct0.fft.imdct_calc(vc->modes[mode_number].blockflag ? &vc->mdct1 : &vc->mdct0, buf, ch_floor_ptr, buf_tmp); 1624 vc->mdct0.fft.imdct_calc(vc->modes[mode_number].blockflag ? &vc->mdct1 : &vc->mdct0, buf, ch_floor_ptr, buf_tmp);
1602 1625
1626 //FIXME process channels together, to allow faster simd vector_fmul_add_add?
1603 if (vc->modes[mode_number].blockflag) { 1627 if (vc->modes[mode_number].blockflag) {
1604 // -- overlap/add 1628 // -- overlap/add
1605 if (previous_window) { 1629 if (previous_window) {
1606 for(k=j, i=0;i<vc->blocksize_1/2;++i, k+=step) { 1630 vc->dsp.vector_fmul_add_add(ret+j, buf, lwin, saved, vc->add_bias, vc->blocksize_1/2, step);
1607 ret[k]=saved[i]+buf[i]*lwin[i]+BIAS;
1608 }
1609 retlen=vc->blocksize_1/2; 1631 retlen=vc->blocksize_1/2;
1610 } else { 1632 } else {
1611 buf += (vc->blocksize_1-vc->blocksize_0)/4; 1633 int len = (vc->blocksize_1-vc->blocksize_0)/4;
1612 for(k=j, i=0;i<vc->blocksize_0/2;++i, k+=step) { 1634 buf += len;
1613 ret[k]=saved[i]+buf[i]*swin[i]+BIAS; 1635 vc->dsp.vector_fmul_add_add(ret+j, buf, swin, saved, vc->add_bias, vc->blocksize_0/2, step);
1614 } 1636 k = vc->blocksize_0/2*step + j;
1615 buf += vc->blocksize_0/2; 1637 buf += vc->blocksize_0/2;
1616 for(i=0;i<(vc->blocksize_1-vc->blocksize_0)/4;++i, k+=step) { 1638 if(vc->exp_bias){
1617 ret[k]=buf[i]+BIAS; 1639 for(i=0; i<len; i++, k+=step)
1640 ((uint32_t*)ret)[k] = ((uint32_t*)buf)[i] + vc->exp_bias; // ret[k]=buf[i]*(1<<bias)
1641 } else {
1642 for(i=0; i<len; i++, k+=step)
1643 ret[k] = buf[i] + fadd_bias;
1618 } 1644 }
1619 buf=vc->buf; 1645 buf=vc->buf;
1620 retlen=vc->blocksize_0/2+(vc->blocksize_1-vc->blocksize_0)/4; 1646 retlen=vc->blocksize_0/2+len;
1621 } 1647 }
1622 // -- save 1648 // -- save
1623 if (next_window) { 1649 if (next_window) {
1624 buf += vc->blocksize_1/2; 1650 buf += vc->blocksize_1/2;
1625 lwin += vc->blocksize_1/2-1; 1651 vc->dsp.vector_fmul_reverse(saved, buf, lwin, vc->blocksize_1/2);
1626 for(i=0;i<vc->blocksize_1/2;++i) {
1627 saved[i]=buf[i]*lwin[-i];
1628 }
1629 saved_start=0; 1652 saved_start=0;
1630 } else { 1653 } else {
1631 saved_start=(vc->blocksize_1-vc->blocksize_0)/4; 1654 saved_start=(vc->blocksize_1-vc->blocksize_0)/4;
1632 buf += vc->blocksize_1/2; 1655 buf += vc->blocksize_1/2;
1633 for(i=0;i<saved_start;++i) { 1656 for(i=0; i<saved_start; i++)
1634 saved[i]=buf[i]; 1657 ((uint32_t*)saved)[i] = ((uint32_t*)buf)[i] + vc->exp_bias;
1635 } 1658 vc->dsp.vector_fmul_reverse(saved+saved_start, buf+saved_start, swin, vc->blocksize_0/2);
1636 swin += vc->blocksize_0/2-1;
1637 for(i=0;i<vc->blocksize_0/2;++i) {
1638 saved[saved_start+i]=buf[saved_start+i]*swin[-i];
1639 }
1640 } 1659 }
1641 } else { 1660 } else {
1642 // --overlap/add 1661 // --overlap/add
1643 for(k=j, i=0;i<saved_start;++i, k+=step) { 1662 if(vc->add_bias) {
1644 ret[k]=saved[i]+BIAS; 1663 for(k=j, i=0;i<saved_start;++i, k+=step)
1645 } 1664 ret[k] = saved[i] + fadd_bias;
1646 for(i=0;i<vc->blocksize_0/2;++i, k+=step) { 1665 } else {
1647 ret[k]=saved[saved_start+i]+buf[i]*swin[i]+BIAS; 1666 for(k=j, i=0;i<saved_start;++i, k+=step)
1648 } 1667 ret[k] = saved[i];
1668 }
1669 vc->dsp.vector_fmul_add_add(ret+k, buf, swin, saved+saved_start, vc->add_bias, vc->blocksize_0/2, step);
1649 retlen=saved_start+vc->blocksize_0/2; 1670 retlen=saved_start+vc->blocksize_0/2;
1650 // -- save 1671 // -- save
1651 buf += vc->blocksize_0/2; 1672 buf += vc->blocksize_0/2;
1652 swin += vc->blocksize_0/2-1; 1673 vc->dsp.vector_fmul_reverse(saved, buf, swin, vc->blocksize_0/2);
1653 for(i=0;i<vc->blocksize_0/2;++i) {
1654 saved[i]=buf[i]*swin[-i];
1655 }
1656 saved_start=0; 1674 saved_start=0;
1657 } 1675 }
1658 } 1676 }
1659 vc->saved_start=saved_start; 1677 vc->saved_start=saved_start;
1660 1678
1693 return buf_size ; 1711 return buf_size ;
1694 } 1712 }
1695 1713
1696 AV_DEBUG("parsed %d bytes %d bits, returned %d samples (*ch*bits) \n", get_bits_count(gb)/8, get_bits_count(gb)%8, len); 1714 AV_DEBUG("parsed %d bytes %d bits, returned %d samples (*ch*bits) \n", get_bits_count(gb)/8, get_bits_count(gb)%8, len);
1697 1715
1698 for(i=0;i<len;++i) { 1716 vc->dsp.float_to_int16(data, vc->ret, len);
1699 int_fast32_t tmp= ((int32_t*)vc->ret)[i];
1700 if(tmp & 0xf0000){
1701 // tmp= (0x43c0ffff - tmp)>>31; //ask gcc devs why this is slower
1702 if(tmp > 0x43c0ffff) tmp= 0xFFFF;
1703 else tmp= 0;
1704 }
1705 ((int16_t*)data)[i]=tmp - 0x8000;
1706 }
1707
1708 *data_size=len*2; 1717 *data_size=len*2;
1709 1718
1710 return buf_size ; 1719 return buf_size ;
1711 } 1720 }
1712 1721