comparison aac.c @ 10874:bcfe2acbf190 libavcodec

AAC: Compress codebook tables and optimise sign bit handling. The codebooks each consist of a small number of values repeated in groups of 2 or 4. Storing the codebooks as a packed list of 2- or 4-bit indexes into a table reduces their size substantially (from 7.5k to 1.5k), resulting in less cache pressure. For the band types with sign bits in the bitstream, storing the number and position of non-zero codebook values using a few bits avoids multiple get_bits() calls and floating-point comparisons, which gcc handles miserably. Some float/int type punning also avoids gcc brain damage. Overall speedup: 20-35% on Cortex-A8, 20% on Core i7.
author mru
date Wed, 13 Jan 2010 16:46:28 +0000
parents e8f4b9e41b7a
children 6a6a1c2d8745
comparison
equal deleted inserted replaced
10873:fb42dfc877cc 10874:bcfe2acbf190
99 }; 99 };
100 100
101 static VLC vlc_scalefactors; 101 static VLC vlc_scalefactors;
102 static VLC vlc_spectral[11]; 102 static VLC vlc_spectral[11];
103 103
104 static float cbrt_tab[1<<13]; 104 static uint32_t cbrt_tab[1<<13];
105 105
106 static ChannelElement *get_che(AACContext *ac, int type, int elem_id) 106 static ChannelElement *get_che(AACContext *ac, int type, int elem_id)
107 { 107 {
108 if (ac->tag_che_map[type][elem_id]) { 108 if (ac->tag_che_map[type][elem_id]) {
109 return ac->tag_che_map[type][elem_id]; 109 return ac->tag_che_map[type][elem_id];
554 ff_kbd_window_init(ff_aac_kbd_long_1024, 4.0, 1024); 554 ff_kbd_window_init(ff_aac_kbd_long_1024, 4.0, 1024);
555 ff_kbd_window_init(ff_aac_kbd_short_128, 6.0, 128); 555 ff_kbd_window_init(ff_aac_kbd_short_128, 6.0, 128);
556 ff_init_ff_sine_windows(10); 556 ff_init_ff_sine_windows(10);
557 ff_init_ff_sine_windows( 7); 557 ff_init_ff_sine_windows( 7);
558 558
559 if (!cbrt_tab[(1<<13) - 1]) 559 if (!cbrt_tab[(1<<13) - 1]) {
560 for (i = 0; i < 1<<13; i++) 560 for (i = 0; i < 1<<13; i++) {
561 cbrt_tab[i] = cbrtf(i) * i; 561 union float754 f;
562 f.f = cbrtf(i) * i;
563 cbrt_tab[i] = f.i;
564 }
565 }
562 566
563 return 0; 567 return 0;
564 } 568 }
565 569
566 /** 570 /**
856 } else if (ms_present == 2) { 860 } else if (ms_present == 2) {
857 memset(cpe->ms_mask, 1, cpe->ch[0].ics.num_window_groups * cpe->ch[0].ics.max_sfb * sizeof(cpe->ms_mask[0])); 861 memset(cpe->ms_mask, 1, cpe->ch[0].ics.num_window_groups * cpe->ch[0].ics.max_sfb * sizeof(cpe->ms_mask[0]));
858 } 862 }
859 } 863 }
860 864
/**
 * Expand one packed two-value codebook entry into scaled coefficients.
 *
 * @param dst   output buffer; two floats are written
 * @param v     table of values, indexed by the 4-bit fields of idx
 * @param idx   packed entry: bits 0-3 select the first value,
 *              bits 4-7 the second; higher bits are ignored
 * @param scale pointer to the factor applied to both values
 * @return dst advanced past the two floats just written
 */
static inline float *VMUL2(float *dst, const float *v, unsigned idx,
                           const float *scale)
{
    const float gain = *scale;

    dst[0] = v[ idx       & 15] * gain;
    dst[1] = v[(idx >> 4) & 15] * gain;

    return dst + 2;
}
873
/**
 * Expand one packed four-value codebook entry into scaled coefficients.
 *
 * @param dst   output buffer; four floats are written
 * @param v     table of values, indexed by the 2-bit fields of idx
 * @param idx   packed entry: bits 0-1, 2-3, 4-5 and 6-7 select the
 *              first to fourth value; higher bits are ignored
 * @param scale pointer to the factor applied to all four values
 * @return dst advanced past the four floats just written
 */
static inline float *VMUL4(float *dst, const float *v, unsigned idx,
                           const float *scale)
{
    const float gain = *scale;

    dst[0] = v[ idx       & 3] * gain;
    dst[1] = v[(idx >> 2) & 3] * gain;
    dst[2] = v[(idx >> 4) & 3] * gain;
    dst[3] = v[(idx >> 6) & 3] * gain;

    return dst + 4;
}
884
885 static inline float *VMUL2S(float *dst, const float *v, unsigned idx,
886 unsigned sign, const float *scale)
887 {
888 union float754 s0, s1;
889
890 s0.f = s1.f = *scale;
891 s0.i ^= sign >> 1 << 31;
892 s1.i ^= sign << 31;
893
894 *dst++ = v[idx & 15] * s0.f;
895 *dst++ = v[idx>>4 & 15] * s1.f;
896
897 return dst;
898 }
899
900 static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
901 unsigned sign, const float *scale)
902 {
903 unsigned nz = idx >> 12;
904 union float754 s = { .f = *scale };
905 union float754 t;
906
907 t.i = s.i ^ (sign & 1<<31);
908 *dst++ = v[idx & 3] * t.f;
909
910 sign <<= nz & 1; nz >>= 1;
911 t.i = s.i ^ (sign & 1<<31);
912 *dst++ = v[idx>>2 & 3] * t.f;
913
914 sign <<= nz & 1; nz >>= 1;
915 t.i = s.i ^ (sign & 1<<31);
916 *dst++ = v[idx>>4 & 3] * t.f;
917
918 sign <<= nz & 1; nz >>= 1;
919 t.i = s.i ^ (sign & 1<<31);
920 *dst++ = v[idx>>6 & 3] * t.f;
921
922 return dst;
923 }
924
861 /** 925 /**
862 * Decode spectral data; reference: table 4.50. 926 * Decode spectral data; reference: table 4.50.
863 * Dequantize and scale spectral data; reference: 4.6.3.3. 927 * Dequantize and scale spectral data; reference: 4.6.3.3.
864 * 928 *
865 * @param coef array of dequantized, scaled spectral data 929 * @param coef array of dequantized, scaled spectral data
878 { 942 {
879 int i, k, g, idx = 0; 943 int i, k, g, idx = 0;
880 const int c = 1024 / ics->num_windows; 944 const int c = 1024 / ics->num_windows;
881 const uint16_t *offsets = ics->swb_offset; 945 const uint16_t *offsets = ics->swb_offset;
882 float *coef_base = coef; 946 float *coef_base = coef;
883 static const float sign_lookup[] = { 1.0f, -1.0f }; 947 int err_idx;
884 948
885 for (g = 0; g < ics->num_windows; g++) 949 for (g = 0; g < ics->num_windows; g++)
886 memset(coef + g * 128 + offsets[ics->max_sfb], 0, sizeof(float) * (c - offsets[ics->max_sfb])); 950 memset(coef + g * 128 + offsets[ics->max_sfb], 0, sizeof(float) * (c - offsets[ics->max_sfb]));
887 951
888 for (g = 0; g < ics->num_window_groups; g++) { 952 for (g = 0; g < ics->num_window_groups; g++) {
889 for (i = 0; i < ics->max_sfb; i++, idx++) { 953 for (i = 0; i < ics->max_sfb; i++, idx++) {
890 const int cur_band_type = band_type[idx]; 954 const int cur_band_type = band_type[idx];
891 const int dim = cur_band_type >= FIRST_PAIR_BT ? 2 : 4;
892 const int is_cb_unsigned = IS_CODEBOOK_UNSIGNED(cur_band_type);
893 int group; 955 int group;
894 if (cur_band_type == ZERO_BT || cur_band_type == INTENSITY_BT2 || cur_band_type == INTENSITY_BT) { 956 if (cur_band_type == ZERO_BT || cur_band_type == INTENSITY_BT2 || cur_band_type == INTENSITY_BT) {
895 for (group = 0; group < ics->group_len[g]; group++) { 957 for (group = 0; group < ics->group_len[g]; group++) {
896 memset(coef + group * 128 + offsets[i], 0, (offsets[i + 1] - offsets[i]) * sizeof(float)); 958 memset(coef + group * 128 + offsets[i], 0, (offsets[i + 1] - offsets[i]) * sizeof(float));
897 } 959 }
910 band_energy = ac->dsp.scalarproduct_float(cf, cf, len); 972 band_energy = ac->dsp.scalarproduct_float(cf, cf, len);
911 scale = sf[idx] / sqrtf(band_energy); 973 scale = sf[idx] / sqrtf(band_energy);
912 ac->dsp.vector_fmul_scalar(cf, cf, scale, len); 974 ac->dsp.vector_fmul_scalar(cf, cf, scale, len);
913 } 975 }
914 } else { 976 } else {
977 const float *vq = ff_aac_codebook_vector_vals[cur_band_type-1];
978 const uint16_t *cb_vector_idx = ff_aac_codebook_vector_idx[cur_band_type-1];
979 VLC_TYPE (*vlc_tab)[2] = vlc_spectral[cur_band_type - 1].table;
980 const int cb_size = ff_aac_spectral_sizes[cur_band_type-1];
981
915 for (group = 0; group < ics->group_len[g]; group++) { 982 for (group = 0; group < ics->group_len[g]; group++) {
916 const float *vq[96];
917 const float **vqp = vq;
918 float *cf = coef + (group << 7) + offsets[i]; 983 float *cf = coef + (group << 7) + offsets[i];
984 uint32_t *icf = (uint32_t *) cf;
919 int len = offsets[i + 1] - offsets[i]; 985 int len = offsets[i + 1] - offsets[i];
920 986
921 for (k = offsets[i]; k < offsets[i + 1]; k += dim) { 987 switch ((cur_band_type-1) >> 1) {
922 const int index = get_vlc2(gb, vlc_spectral[cur_band_type - 1].table, 6, 3); 988 case 0:
923 const int coef_tmp_idx = (group << 7) + k; 989 do {
924 const float *vq_ptr; 990 const int index = get_vlc2(gb, vlc_tab, 6, 3);
925 int j; 991 unsigned cb_idx;
926 if (index >= ff_aac_spectral_sizes[cur_band_type - 1]) { 992
927 av_log(ac->avccontext, AV_LOG_ERROR, 993 if (index >= cb_size) {
928 "Read beyond end of ff_aac_codebook_vectors[%d][]. index %d >= %d\n", 994 err_idx = index;
929 cur_band_type - 1, index, ff_aac_spectral_sizes[cur_band_type - 1]); 995 goto err_cb_overflow;
930 return -1;
931 }
932 vq_ptr = &ff_aac_codebook_vectors[cur_band_type - 1][index * dim];
933 *vqp++ = vq_ptr;
934 if (is_cb_unsigned) {
935 if (vq_ptr[0])
936 coef[coef_tmp_idx ] = sign_lookup[get_bits1(gb)];
937 if (vq_ptr[1])
938 coef[coef_tmp_idx + 1] = sign_lookup[get_bits1(gb)];
939 if (dim == 4) {
940 if (vq_ptr[2])
941 coef[coef_tmp_idx + 2] = sign_lookup[get_bits1(gb)];
942 if (vq_ptr[3])
943 coef[coef_tmp_idx + 3] = sign_lookup[get_bits1(gb)];
944 } 996 }
945 if (cur_band_type == ESC_BT) { 997
946 for (j = 0; j < 2; j++) { 998 cb_idx = cb_vector_idx[index];
947 if (vq_ptr[j] == 64.0f) { 999 cf = VMUL4(cf, vq, cb_idx, sf + idx);
948 int n = 4; 1000 } while (len -= 4);
949 /* The total length of escape_sequence must be < 22 bits according 1001 break;
950 to the specification (i.e. max is 111111110xxxxxxxxxxxx). */ 1002 case 1:
951 while (get_bits1(gb) && n < 13) n++; 1003 do {
952 if (n == 13) { 1004 const int index = get_vlc2(gb, vlc_tab, 6, 3);
953 av_log(ac->avccontext, AV_LOG_ERROR, "error in spectral data, ESC overflow\n"); 1005 unsigned nnz;
954 return -1; 1006 unsigned cb_idx;
955 } 1007 uint32_t bits;
956 n = (1 << n) + get_bits(gb, n); 1008
957 coef[coef_tmp_idx + j] *= cbrt_tab[n]; 1009 if (index >= cb_size) {
958 } else 1010 err_idx = index;
959 coef[coef_tmp_idx + j] *= vq_ptr[j]; 1011 goto err_cb_overflow;
1012 }
1013
1014 cb_idx = cb_vector_idx[index];
1015 nnz = cb_idx >> 8 & 15;
1016 bits = get_bits(gb, nnz) << (32-nnz);
1017 cf = VMUL4S(cf, vq, cb_idx, bits, sf + idx);
1018 } while (len -= 4);
1019 break;
1020 case 2:
1021 do {
1022 const int index = get_vlc2(gb, vlc_tab, 6, 3);
1023 unsigned cb_idx;
1024
1025 if (index >= cb_size) {
1026 err_idx = index;
1027 goto err_cb_overflow;
1028 }
1029
1030 cb_idx = cb_vector_idx[index];
1031 cf = VMUL2(cf, vq, cb_idx, sf + idx);
1032 } while (len -= 2);
1033 break;
1034 case 3:
1035 case 4:
1036 do {
1037 const int index = get_vlc2(gb, vlc_tab, 6, 3);
1038 unsigned nnz;
1039 unsigned cb_idx;
1040 unsigned sign;
1041
1042 if (index >= cb_size) {
1043 err_idx = index;
1044 goto err_cb_overflow;
1045 }
1046
1047 cb_idx = cb_vector_idx[index];
1048 nnz = cb_idx >> 8 & 15;
1049 sign = get_bits(gb, nnz) << (cb_idx >> 12);
1050 cf = VMUL2S(cf, vq, cb_idx, sign, sf + idx);
1051 } while (len -= 2);
1052 break;
1053 default:
1054 for (k = 0; k < len; k += 2, icf += 2) {
1055 const int index = get_vlc2(gb, vlc_tab, 6, 3);
1056 unsigned nzt, nnz;
1057 unsigned cb_idx;
1058 uint32_t bits;
1059 int j;
1060
1061 if (!index) {
1062 icf[0] = icf[1] = 0;
1063 continue;
1064 }
1065
1066 if (index >= cb_size) {
1067 err_idx = index;
1068 goto err_cb_overflow;
1069 }
1070
1071 cb_idx = cb_vector_idx[index];
1072 nnz = cb_idx >> 12;
1073 nzt = cb_idx >> 8;
1074 bits = get_bits(gb, nnz) << (32-nnz);
1075
1076 for (j = 0; j < 2; j++) {
1077 if (nzt & 1<<j) {
1078 int n = 4;
1079 /* The total length of escape_sequence must be < 22 bits according
1080 to the specification (i.e. max is 111111110xxxxxxxxxxxx). */
1081 while (get_bits1(gb) && n < 13) n++;
1082 if (n == 13) {
1083 av_log(ac->avccontext, AV_LOG_ERROR, "error in spectral data, ESC overflow\n");
1084 return -1;
1085 }
1086 n = (1 << n) + get_bits(gb, n);
1087 icf[j] = cbrt_tab[n] | (bits & 1<<31);
1088 bits <<= 1;
1089 } else {
1090 unsigned v = ((const uint32_t*)vq)[cb_idx & 15];
1091 icf[j] = (bits & 1<<31) | v;
1092 bits <<= !!v;
960 } 1093 }
1094 cb_idx >>= 4;
961 } 1095 }
962 } 1096 }
963 } 1097
964
965 if (is_cb_unsigned && cur_band_type != ESC_BT) {
966 ac->dsp.vector_fmul_sv_scalar[dim>>2](
967 cf, cf, vq, sf[idx], len);
968 } else if (cur_band_type == ESC_BT) {
969 ac->dsp.vector_fmul_scalar(cf, cf, sf[idx], len); 1098 ac->dsp.vector_fmul_scalar(cf, cf, sf[idx], len);
970 } else { /* !is_cb_unsigned */
971 ac->dsp.sv_fmul_scalar[dim>>2](cf, vq, sf[idx], len);
972 } 1099 }
973 } 1100 }
974 } 1101 }
975 } 1102 }
976 coef += ics->group_len[g] << 7; 1103 coef += ics->group_len[g] << 7;
991 coef_base[ pulse->pos[i] ] = cbrtf(fabsf(ico)) * ico * sf[idx]; 1118 coef_base[ pulse->pos[i] ] = cbrtf(fabsf(ico)) * ico * sf[idx];
992 } 1119 }
993 } 1120 }
994 } 1121 }
995 return 0; 1122 return 0;
1123
1124 err_cb_overflow:
1125 av_log(ac->avccontext, AV_LOG_ERROR,
1126 "Read beyond end of ff_aac_codebook_vectors[%d][]. index %d >= %d\n",
1127 band_type[idx], err_idx, ff_aac_spectral_sizes[band_type[idx]]);
1128 return -1;
996 } 1129 }
997 1130
998 static av_always_inline float flt16_round(float pf) 1131 static av_always_inline float flt16_round(float pf)
999 { 1132 {
1000 union float754 tmp; 1133 union float754 tmp;