Mercurial > libavcodec.hg
diff aac.c @ 10874:bcfe2acbf190 libavcodec
AAC: Compress codebook tables and optimise sign bit handling
The codebooks each consist of small number of values repeated in
groups of 2 or 4. Storing the codebooks as a packed list of 2- or
4-bit indexes into a table reduces their size substantially (from 7.5k
to 1.5k), resulting in less cache pressure.
For the band types with sign bits in the bitstream, storing the number
and position of non-zero codebook values using a few bits avoids
multiple get_bits() calls and floating-point comparisons which gcc
handles miserably.
Some float/int type punning also avoids gcc brain damage.
Overall speedup 20-35% on Cortex-A8, 20% on Core i7.
author | mru |
---|---|
date | Wed, 13 Jan 2010 16:46:28 +0000 |
parents | e8f4b9e41b7a |
children | 6a6a1c2d8745 |
line wrap: on
line diff
--- a/aac.c Wed Jan 13 04:35:19 2010 +0000 +++ b/aac.c Wed Jan 13 16:46:28 2010 +0000 @@ -101,7 +101,7 @@ static VLC vlc_scalefactors; static VLC vlc_spectral[11]; -static float cbrt_tab[1<<13]; +static uint32_t cbrt_tab[1<<13]; static ChannelElement *get_che(AACContext *ac, int type, int elem_id) { @@ -556,9 +556,13 @@ ff_init_ff_sine_windows(10); ff_init_ff_sine_windows( 7); - if (!cbrt_tab[(1<<13) - 1]) - for (i = 0; i < 1<<13; i++) - cbrt_tab[i] = cbrtf(i) * i; + if (!cbrt_tab[(1<<13) - 1]) { + for (i = 0; i < 1<<13; i++) { + union float754 f; + f.f = cbrtf(i) * i; + cbrt_tab[i] = f.i; + } + } return 0; } @@ -858,6 +862,66 @@ } } +static inline float *VMUL2(float *dst, const float *v, unsigned idx, + const float *scale) +{ + float s = *scale; + *dst++ = v[idx & 15] * s; + *dst++ = v[idx>>4 & 15] * s; + return dst; +} + +static inline float *VMUL4(float *dst, const float *v, unsigned idx, + const float *scale) +{ + float s = *scale; + *dst++ = v[idx & 3] * s; + *dst++ = v[idx>>2 & 3] * s; + *dst++ = v[idx>>4 & 3] * s; + *dst++ = v[idx>>6 & 3] * s; + return dst; +} + +static inline float *VMUL2S(float *dst, const float *v, unsigned idx, + unsigned sign, const float *scale) +{ + union float754 s0, s1; + + s0.f = s1.f = *scale; + s0.i ^= sign >> 1 << 31; + s1.i ^= sign << 31; + + *dst++ = v[idx & 15] * s0.f; + *dst++ = v[idx>>4 & 15] * s1.f; + + return dst; +} + +static inline float *VMUL4S(float *dst, const float *v, unsigned idx, + unsigned sign, const float *scale) +{ + unsigned nz = idx >> 12; + union float754 s = { .f = *scale }; + union float754 t; + + t.i = s.i ^ (sign & 1<<31); + *dst++ = v[idx & 3] * t.f; + + sign <<= nz & 1; nz >>= 1; + t.i = s.i ^ (sign & 1<<31); + *dst++ = v[idx>>2 & 3] * t.f; + + sign <<= nz & 1; nz >>= 1; + t.i = s.i ^ (sign & 1<<31); + *dst++ = v[idx>>4 & 3] * t.f; + + sign <<= nz & 1; nz >>= 1; + t.i = s.i ^ (sign & 1<<31); + *dst++ = v[idx>>6 & 3] * t.f; + + return dst; +} + /** * Decode spectral data; reference: table 4.50. * Dequantize and scale spectral data; reference: 4.6.3.3. @@ -880,7 +944,7 @@ const int c = 1024 / ics->num_windows; const uint16_t *offsets = ics->swb_offset; float *coef_base = coef; - static const float sign_lookup[] = { 1.0f, -1.0f }; + int err_idx; for (g = 0; g < ics->num_windows; g++) memset(coef + g * 128 + offsets[ics->max_sfb], 0, sizeof(float) * (c - offsets[ics->max_sfb])); @@ -888,8 +952,6 @@ for (g = 0; g < ics->num_window_groups; g++) { for (i = 0; i < ics->max_sfb; i++, idx++) { const int cur_band_type = band_type[idx]; - const int dim = cur_band_type >= FIRST_PAIR_BT ? 2 : 4; - const int is_cb_unsigned = IS_CODEBOOK_UNSIGNED(cur_band_type); int group; if (cur_band_type == ZERO_BT || cur_band_type == INTENSITY_BT2 || cur_band_type == INTENSITY_BT) { for (group = 0; group < ics->group_len[g]; group++) { @@ -912,63 +974,128 @@ ac->dsp.vector_fmul_scalar(cf, cf, scale, len); } } else { + const float *vq = ff_aac_codebook_vector_vals[cur_band_type-1]; + const uint16_t *cb_vector_idx = ff_aac_codebook_vector_idx[cur_band_type-1]; + VLC_TYPE (*vlc_tab)[2] = vlc_spectral[cur_band_type - 1].table; + const int cb_size = ff_aac_spectral_sizes[cur_band_type-1]; + for (group = 0; group < ics->group_len[g]; group++) { - const float *vq[96]; - const float **vqp = vq; float *cf = coef + (group << 7) + offsets[i]; + uint32_t *icf = (uint32_t *) cf; int len = offsets[i + 1] - offsets[i]; - for (k = offsets[i]; k < offsets[i + 1]; k += dim) { - const int index = get_vlc2(gb, vlc_spectral[cur_band_type - 1].table, 6, 3); - const int coef_tmp_idx = (group << 7) + k; - const float *vq_ptr; - int j; - if (index >= ff_aac_spectral_sizes[cur_band_type - 1]) { - av_log(ac->avccontext, AV_LOG_ERROR, - "Read beyond end of ff_aac_codebook_vectors[%d][]. index %d >= %d\n", - cur_band_type - 1, index, ff_aac_spectral_sizes[cur_band_type - 1]); - return -1; - } - vq_ptr = &ff_aac_codebook_vectors[cur_band_type - 1][index * dim]; - *vqp++ = vq_ptr; - if (is_cb_unsigned) { - if (vq_ptr[0]) - coef[coef_tmp_idx ] = sign_lookup[get_bits1(gb)]; - if (vq_ptr[1]) - coef[coef_tmp_idx + 1] = sign_lookup[get_bits1(gb)]; - if (dim == 4) { - if (vq_ptr[2]) - coef[coef_tmp_idx + 2] = sign_lookup[get_bits1(gb)]; - if (vq_ptr[3]) - coef[coef_tmp_idx + 3] = sign_lookup[get_bits1(gb)]; + switch ((cur_band_type-1) >> 1) { + case 0: + do { + const int index = get_vlc2(gb, vlc_tab, 6, 3); + unsigned cb_idx; + + if (index >= cb_size) { + err_idx = index; + goto err_cb_overflow; + } + + cb_idx = cb_vector_idx[index]; + cf = VMUL4(cf, vq, cb_idx, sf + idx); + } while (len -= 4); + break; + case 1: + do { + const int index = get_vlc2(gb, vlc_tab, 6, 3); + unsigned nnz; + unsigned cb_idx; + uint32_t bits; + + if (index >= cb_size) { + err_idx = index; + goto err_cb_overflow; + } + + cb_idx = cb_vector_idx[index]; + nnz = cb_idx >> 8 & 15; + bits = get_bits(gb, nnz) << (32-nnz); + cf = VMUL4S(cf, vq, cb_idx, bits, sf + idx); + } while (len -= 4); + break; + case 2: + do { + const int index = get_vlc2(gb, vlc_tab, 6, 3); + unsigned cb_idx; + + if (index >= cb_size) { + err_idx = index; + goto err_cb_overflow; } - if (cur_band_type == ESC_BT) { - for (j = 0; j < 2; j++) { - if (vq_ptr[j] == 64.0f) { - int n = 4; - /* The total length of escape_sequence must be < 22 bits according - to the specification (i.e. max is 111111110xxxxxxxxxxxx). */ - while (get_bits1(gb) && n < 13) n++; - if (n == 13) { - av_log(ac->avccontext, AV_LOG_ERROR, "error in spectral data, ESC overflow\n"); - return -1; - } - n = (1 << n) + get_bits(gb, n); - coef[coef_tmp_idx + j] *= cbrt_tab[n]; - } else - coef[coef_tmp_idx + j] *= vq_ptr[j]; + + cb_idx = cb_vector_idx[index]; + cf = VMUL2(cf, vq, cb_idx, sf + idx); + } while (len -= 2); + break; + case 3: + case 4: + do { + const int index = get_vlc2(gb, vlc_tab, 6, 3); + unsigned nnz; + unsigned cb_idx; + unsigned sign; + + if (index >= cb_size) { + err_idx = index; + goto err_cb_overflow; + } + + cb_idx = cb_vector_idx[index]; + nnz = cb_idx >> 8 & 15; + sign = get_bits(gb, nnz) << (cb_idx >> 12); + cf = VMUL2S(cf, vq, cb_idx, sign, sf + idx); + } while (len -= 2); + break; + default: + for (k = 0; k < len; k += 2, icf += 2) { + const int index = get_vlc2(gb, vlc_tab, 6, 3); + unsigned nzt, nnz; + unsigned cb_idx; + uint32_t bits; + int j; + + if (!index) { + icf[0] = icf[1] = 0; + continue; + } + + if (index >= cb_size) { + err_idx = index; + goto err_cb_overflow; + } + + cb_idx = cb_vector_idx[index]; + nnz = cb_idx >> 12; + nzt = cb_idx >> 8; + bits = get_bits(gb, nnz) << (32-nnz); + + for (j = 0; j < 2; j++) { + if (nzt & 1<<j) { + int n = 4; + /* The total length of escape_sequence must be < 22 bits according + to the specification (i.e. max is 111111110xxxxxxxxxxxx). */ + while (get_bits1(gb) && n < 13) n++; + if (n == 13) { + av_log(ac->avccontext, AV_LOG_ERROR, "error in spectral data, ESC overflow\n"); + return -1; + } + n = (1 << n) + get_bits(gb, n); + icf[j] = cbrt_tab[n] | (bits & 1<<31); + bits <<= 1; + } else { + unsigned v = ((const uint32_t*)vq)[cb_idx & 15]; + icf[j] = (bits & 1<<31) | v; + bits <<= !!v; } + cb_idx >>= 4; } } - } - if (is_cb_unsigned && cur_band_type != ESC_BT) { - ac->dsp.vector_fmul_sv_scalar[dim>>2]( - cf, cf, vq, sf[idx], len); - } else if (cur_band_type == ESC_BT) { ac->dsp.vector_fmul_scalar(cf, cf, sf[idx], len); - } else { /* !is_cb_unsigned */ - ac->dsp.sv_fmul_scalar[dim>>2](cf, vq, sf[idx], len); } } } @@ -993,6 +1120,12 @@ } } return 0; + +err_cb_overflow: + av_log(ac->avccontext, AV_LOG_ERROR, + "Read beyond end of ff_aac_codebook_vectors[%d][]. index %d >= %d\n", + band_type[idx], err_idx, ff_aac_spectral_sizes[band_type[idx]]); + return -1; } static av_always_inline float flt16_round(float pf)