comparison aacpsy.c @ 12408:ae72506d4c2a libavcodec

aacenc: LAME-inspired window decision. This performs quite a bit better than the current 3GPP-inspired window decision on all the samples I have tested. On the castanets.wav sample it performs very similarly to iTunes window selection, and seems to perform better than Nero. On fatboy.wav, it seems to perform at least as well as iTunes, if not better. Nero performs horribly on this sample. Patch by: Nathan Caldwell <saintdev@gmail.com>
author alexc
date Mon, 23 Aug 2010 20:00:03 +0000
parents b25537518e40
children 94b578d0af10
comparison
equal deleted inserted replaced
12407:00676ed9b822 12408:ae72506d4c2a
42 #define PSY_3GPP_SPREAD_LOW 1.5f // spreading factor for ascending threshold spreading (15 dB/Bark) 42 #define PSY_3GPP_SPREAD_LOW 1.5f // spreading factor for ascending threshold spreading (15 dB/Bark)
43 #define PSY_3GPP_SPREAD_HI 3.0f // spreading factor for descending threshold spreading (30 dB/Bark) 43 #define PSY_3GPP_SPREAD_HI 3.0f // spreading factor for descending threshold spreading (30 dB/Bark)
44 44
45 #define PSY_3GPP_RPEMIN 0.01f 45 #define PSY_3GPP_RPEMIN 0.01f
46 #define PSY_3GPP_RPELEV 2.0f 46 #define PSY_3GPP_RPELEV 2.0f
47
48 /* Constants used by the LAME-inspired window (block switching) decision */
49 #define PSY_LAME_FIR_LEN 21 ///< length constant of the high-pass FIR run before LAME attack detection
50 #define AAC_BLOCK_SIZE_LONG 1024 ///< long block size, in samples
51 #define AAC_BLOCK_SIZE_SHORT 128 ///< short block size, in samples
52 #define AAC_NUM_BLOCKS_SHORT 8 ///< number of blocks in a short sequence
53 #define PSY_LAME_NUM_SUBBLOCKS 3 ///< number of sub-blocks each short block is split into for attack detection
54
47 /** 55 /**
48 * @} 56 * @}
49 */ 57 */
50 58
51 /** 59 /**
68 76
69 float win_energy; ///< sliding average of channel energy 77 float win_energy; ///< sliding average of channel energy
70 float iir_state[2]; ///< hi-pass IIR filter state 78 float iir_state[2]; ///< hi-pass IIR filter state
71 uint8_t next_grouping; ///< stored grouping scheme for the next frame (in case of 8 short window sequence) 79 uint8_t next_grouping; ///< stored grouping scheme for the next frame (in case of 8 short window sequence)
72 enum WindowSequence next_window_seq; ///< window sequence to be used in the next frame 80 enum WindowSequence next_window_seq; ///< window sequence to be used in the next frame
81 /* LAME psy model specific members */
82 float attack_threshold; ///< attack threshold for this channel
83 float prev_energy_subshort[AAC_NUM_BLOCKS_SHORT * PSY_LAME_NUM_SUBBLOCKS];
84 int prev_attack; ///< attack value for the last short block in the previous sequence
73 }AacPsyChannel; 85 }AacPsyChannel;
74 86
75 /** 87 /**
76 * psychoacoustic model frame type-dependent coefficients 88 * psychoacoustic model frame type-dependent coefficients
77 */ 89 */
87 */ 99 */
88 typedef struct AacPsyContext{ 100 typedef struct AacPsyContext{
89 AacPsyCoeffs psy_coef[2]; 101 AacPsyCoeffs psy_coef[2];
90 AacPsyChannel *ch; 102 AacPsyChannel *ch;
91 }AacPsyContext; 103 }AacPsyContext;
104
105 /**
106 * LAME psy model preset: maps one quality/bitrate point to an attack threshold
107 */
108 typedef struct {
109 int quality; ///< Quality to map the rest of the values to.
110 /* This is overloaded to be both kbps per channel in ABR mode, and
111 * requested quality in constant quality mode.
112 */
113 float st_lrm; ///< short threshold for L, R, and M channels
114 } PsyLamePreset;
115
116 /**
117 * LAME psy model preset table for ABR
118 */
119 static const PsyLamePreset psy_abr_map[] = {
120 /* TODO: Tuning. These were taken from LAME. */
121 /* kbps/ch st_lrm */
122 { 8, 6.60},
123 { 16, 6.60},
124 { 24, 6.60},
125 { 32, 6.60},
126 { 40, 6.60},
127 { 48, 6.60},
128 { 56, 6.60},
129 { 64, 6.40},
130 { 80, 6.00},
131 { 96, 5.60},
132 {112, 5.20},
133 {128, 5.20},
134 {160, 5.20}
135 };
136
137 /**
138 * LAME psy model preset table for constant quality
139 */
140 static const PsyLamePreset psy_vbr_map[] = {
141 /* vbr_q st_lrm */
142 { 0, 4.20},
143 { 1, 4.20},
144 { 2, 4.20},
145 { 3, 4.20},
146 { 4, 4.20},
147 { 5, 4.20},
148 { 6, 4.20},
149 { 7, 4.20},
150 { 8, 4.20},
151 { 9, 4.20},
152 {10, 4.20}
153 };
154
/**
 * Coefficients of the high-pass FIR used by the LAME window decision.
 *
 * Only one half of the taps is stored; the filter loop applies each value
 * to a pair of input samples. Every entry is pre-multiplied by 2.
 */
static const float psy_fir_coeffs[] = {
    -8.65163e-18 * 2,   /* tap 0 */
    -0.00851586  * 2,   /* tap 1 */
    -6.74764e-18 * 2,   /* tap 2 */
     0.0209036   * 2,   /* tap 3 */
    -3.36639e-17 * 2,   /* tap 4 */
    -0.0438162   * 2,   /* tap 5 */
    -1.54175e-17 * 2,   /* tap 6 */
     0.0931738   * 2,   /* tap 7 */
    -5.52212e-17 * 2,   /* tap 8 */
    -0.313819    * 2,   /* tap 9 */
};
163
164 /**
165 * calculates the attack threshold for ABR from the above table for the LAME psy model
166 */
167 static float lame_calc_attack_threshold(int bitrate)
168 {
169 /* Assume max bitrate to start with */
170 int lower_range = 12, upper_range = 12;
171 int lower_range_kbps = psy_abr_map[12].quality;
172 int upper_range_kbps = psy_abr_map[12].quality;
173 int i;
174
175 /* Determine which bitrates the value specified falls between.
176 * If the loop ends without breaking our above assumption of 320kbps was correct.
177 */
178 for (i = 1; i < 13; i++) {
179 if (FFMAX(bitrate, psy_abr_map[i].quality) != bitrate) {
180 upper_range = i;
181 upper_range_kbps = psy_abr_map[i ].quality;
182 lower_range = i - 1;
183 lower_range_kbps = psy_abr_map[i - 1].quality;
184 break; /* Upper range found */
185 }
186 }
187
188 /* Determine which range the value specified is closer to */
189 if ((upper_range_kbps - bitrate) > (bitrate - lower_range_kbps))
190 return psy_abr_map[lower_range].st_lrm;
191 return psy_abr_map[upper_range].st_lrm;
192 }
193
194 /**
195 * LAME psy model specific initialization
196 */
197 static void lame_window_init(AacPsyContext *ctx, AVCodecContext *avctx) {
198 int i;
199
200 for (i = 0; i < avctx->channels; i++) {
201 AacPsyChannel *pch = &ctx->ch[i];
202
203 if (avctx->flags & CODEC_FLAG_QSCALE)
204 pch->attack_threshold = psy_vbr_map[avctx->global_quality / FF_QP2LAMBDA].st_lrm;
205 else
206 pch->attack_threshold = lame_calc_attack_threshold(avctx->bit_rate / avctx->channels / 1000);
207
208 for (i = 0; i < AAC_NUM_BLOCKS_SHORT * PSY_LAME_NUM_SUBBLOCKS; i++)
209 pch->prev_energy_subshort[i] = 10.0f;
210 }
211 }
92 212
93 /** 213 /**
94 * Calculate Bark value for given line. 214 * Calculate Bark value for given line.
95 */ 215 */
96 static av_cold float calc_bark(float f) 216 static av_cold float calc_bark(float f)
146 start += ctx->bands[j][g]; 266 start += ctx->bands[j][g];
147 } 267 }
148 } 268 }
149 269
150 pctx->ch = av_mallocz(sizeof(AacPsyChannel) * ctx->avctx->channels); 270 pctx->ch = av_mallocz(sizeof(AacPsyChannel) * ctx->avctx->channels);
271
272 lame_window_init(pctx, ctx->avctx);
273
151 return 0; 274 return 0;
152 } 275 }
153 276
154 /** 277 /**
155 * IIR filter used in block switching decision 278 * IIR filter used in block switching decision
314 AacPsyContext *pctx = (AacPsyContext*) apc->model_priv_data; 437 AacPsyContext *pctx = (AacPsyContext*) apc->model_priv_data;
315 av_freep(&pctx->ch); 438 av_freep(&pctx->ch);
316 av_freep(&apc->model_priv_data); 439 av_freep(&apc->model_priv_data);
317 } 440 }
318 441
442 static void lame_apply_block_type(AacPsyChannel *ctx, FFPsyWindowInfo *wi, int uselongblock)
443 {
444 int blocktype = ONLY_LONG_SEQUENCE;
445 if (uselongblock) {
446 if (ctx->next_window_seq == EIGHT_SHORT_SEQUENCE)
447 blocktype = LONG_STOP_SEQUENCE;
448 } else {
449 blocktype = EIGHT_SHORT_SEQUENCE;
450 if (ctx->next_window_seq == ONLY_LONG_SEQUENCE)
451 ctx->next_window_seq = LONG_START_SEQUENCE;
452 if (ctx->next_window_seq == LONG_STOP_SEQUENCE)
453 ctx->next_window_seq = EIGHT_SHORT_SEQUENCE;
454 }
455
456 wi->window_type[0] = ctx->next_window_seq;
457 ctx->next_window_seq = blocktype;
458 }
459
460 static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx,
461 const int16_t *audio, const int16_t *la,
462 int channel, int prev_type)
463 {
464 AacPsyContext *pctx = (AacPsyContext*) ctx->model_priv_data;
465 AacPsyChannel *pch = &pctx->ch[channel];
466 int grouping = 0;
467 int uselongblock = 1;
468 int attacks[AAC_NUM_BLOCKS_SHORT + 1] = { 0 };
469 int i;
470 FFPsyWindowInfo wi;
471
472 memset(&wi, 0, sizeof(wi));
473 if (la) {
474 float hpfsmpl[AAC_BLOCK_SIZE_LONG];
475 float const *pf = hpfsmpl;
476 float attack_intensity[(AAC_NUM_BLOCKS_SHORT + 1) * PSY_LAME_NUM_SUBBLOCKS];
477 float energy_subshort[(AAC_NUM_BLOCKS_SHORT + 1) * PSY_LAME_NUM_SUBBLOCKS];
478 float energy_short[AAC_NUM_BLOCKS_SHORT + 1] = { 0 };
479 int chans = ctx->avctx->channels;
480 const int16_t *firbuf = la + (AAC_BLOCK_SIZE_SHORT/4 - PSY_LAME_FIR_LEN) * chans;
481 int j, att_sum = 0;
482
483 /* LAME comment: apply high pass filter of fs/4 */
484 for (i = 0; i < AAC_BLOCK_SIZE_LONG; i++) {
485 float sum1, sum2;
486 sum1 = firbuf[(i + ((PSY_LAME_FIR_LEN - 1) / 2)) * chans];
487 sum2 = 0.0;
488 for (j = 0; j < ((PSY_LAME_FIR_LEN - 1) / 2) - 1; j += 2) {
489 sum1 += psy_fir_coeffs[j] * (firbuf[(i + j) * chans] + firbuf[(i + PSY_LAME_FIR_LEN - j) * chans]);
490 sum2 += psy_fir_coeffs[j + 1] * (firbuf[(i + j + 1) * chans] + firbuf[(i + PSY_LAME_FIR_LEN - j - 1) * chans]);
491 }
492 hpfsmpl[i] = sum1 + sum2;
493 }
494
495 /* Calculate the energies of each sub-shortblock */
496 for (i = 0; i < PSY_LAME_NUM_SUBBLOCKS; i++) {
497 energy_subshort[i] = pch->prev_energy_subshort[i + ((AAC_NUM_BLOCKS_SHORT - 1) * PSY_LAME_NUM_SUBBLOCKS)];
498 assert(pch->prev_energy_subshort[i + ((AAC_NUM_BLOCKS_SHORT - 2) * PSY_LAME_NUM_SUBBLOCKS + 1)] > 0);
499 attack_intensity[i] = energy_subshort[i] / pch->prev_energy_subshort[i + ((AAC_NUM_BLOCKS_SHORT - 2) * PSY_LAME_NUM_SUBBLOCKS + 1)];
500 energy_short[0] += energy_subshort[i];
501 }
502
503 for (i = 0; i < AAC_NUM_BLOCKS_SHORT * PSY_LAME_NUM_SUBBLOCKS; i++) {
504 float const *const pfe = pf + AAC_BLOCK_SIZE_LONG / (AAC_NUM_BLOCKS_SHORT * PSY_LAME_NUM_SUBBLOCKS);
505 float p = 1.0f;
506 for (; pf < pfe; pf++)
507 if (p < fabsf(*pf))
508 p = fabsf(*pf);
509 pch->prev_energy_subshort[i] = energy_subshort[i + PSY_LAME_NUM_SUBBLOCKS] = p;
510 energy_short[1 + i / PSY_LAME_NUM_SUBBLOCKS] += p;
511 /* FIXME: The indexes below are [i + 3 - 2] in the LAME source.
512 * Obviously the 3 and 2 have some significance, or this would be just [i + 1]
513 * (which is what we use here). What the 3 stands for is ambigious, as it is both
514 * number of short blocks, and the number of sub-short blocks.
515 * It seems that LAME is comparing each sub-block to sub-block + 1 in the
516 * previous block.
517 */
518 if (p > energy_subshort[i + 1])
519 p = p / energy_subshort[i + 1];
520 else if (energy_subshort[i + 1] > p * 10.0f)
521 p = energy_subshort[i + 1] / (p * 10.0f);
522 else
523 p = 0.0;
524 attack_intensity[i + PSY_LAME_NUM_SUBBLOCKS] = p;
525 }
526
527 /* compare energy between sub-short blocks */
528 for (i = 0; i < (AAC_NUM_BLOCKS_SHORT + 1) * PSY_LAME_NUM_SUBBLOCKS; i++)
529 if (!attacks[i / PSY_LAME_NUM_SUBBLOCKS])
530 if (attack_intensity[i] > pch->attack_threshold)
531 attacks[i / PSY_LAME_NUM_SUBBLOCKS] = (i % PSY_LAME_NUM_SUBBLOCKS) + 1;
532
533 /* should have energy change between short blocks, in order to avoid periodic signals */
534 /* Good samples to show the effect are Trumpet test songs */
535 /* GB: tuned (1) to avoid too many short blocks for test sample TRUMPET */
536 /* RH: tuned (2) to let enough short blocks through for test sample FSOL and SNAPS */
537 for (i = 1; i < AAC_NUM_BLOCKS_SHORT + 1; i++) {
538 float const u = energy_short[i - 1];
539 float const v = energy_short[i];
540 float const m = FFMAX(u, v);
541 if (m < 40000) { /* (2) */
542 if (u < 1.7f * v && v < 1.7f * u) { /* (1) */
543 if (i == 1 && attacks[0] < attacks[i])
544 attacks[0] = 0;
545 attacks[i] = 0;
546 }
547 }
548 att_sum += attacks[i];
549 }
550
551 if (attacks[0] <= pch->prev_attack)
552 attacks[0] = 0;
553
554 att_sum += attacks[0];
555 /* 3 below indicates the previous attack happened in the last sub-block of the previous sequence */
556 if (pch->prev_attack == 3 || att_sum) {
557 uselongblock = 0;
558
559 if (attacks[1] && attacks[0])
560 attacks[1] = 0;
561 if (attacks[2] && attacks[1])
562 attacks[2] = 0;
563 if (attacks[3] && attacks[2])
564 attacks[3] = 0;
565 if (attacks[4] && attacks[3])
566 attacks[4] = 0;
567 if (attacks[5] && attacks[4])
568 attacks[5] = 0;
569 if (attacks[6] && attacks[5])
570 attacks[6] = 0;
571 if (attacks[7] && attacks[6])
572 attacks[7] = 0;
573 if (attacks[8] && attacks[7])
574 attacks[8] = 0;
575 }
576 } else {
577 /* We have no lookahead info, so just use same type as the previous sequence. */
578 uselongblock = !(prev_type == EIGHT_SHORT_SEQUENCE);
579 }
580
581 lame_apply_block_type(pch, &wi, uselongblock);
582
583 wi.window_type[1] = prev_type;
584 if (wi.window_type[0] != EIGHT_SHORT_SEQUENCE) {
585 wi.num_windows = 1;
586 wi.grouping[0] = 1;
587 if (wi.window_type[0] == LONG_START_SEQUENCE)
588 wi.window_shape = 0;
589 else
590 wi.window_shape = 1;
591 } else {
592 int lastgrp = 0;
593
594 wi.num_windows = 8;
595 wi.window_shape = 0;
596 for (i = 0; i < 8; i++) {
597 if (!((pch->next_grouping >> i) & 1))
598 lastgrp = i;
599 wi.grouping[lastgrp]++;
600 }
601 }
602
603 /* Determine grouping, based on the location of the first attack, and save for
604 * the next frame.
605 * FIXME: Move this to analysis.
606 * TODO: Tune groupings depending on attack location
607 * TODO: Handle more than one attack in a group
608 */
609 for (i = 0; i < 9; i++) {
610 if (attacks[i]) {
611 grouping = i;
612 break;
613 }
614 }
615 pch->next_grouping = window_grouping[grouping];
616
617 pch->prev_attack = attacks[8];
618
619 return wi;
620 }
319 621
320 const FFPsyModel ff_aac_psy_model = 622 const FFPsyModel ff_aac_psy_model =
321 { 623 {
322 .name = "3GPP TS 26.403-inspired model", 624 .name = "3GPP TS 26.403-inspired model",
323 .init = psy_3gpp_init, 625 .init = psy_3gpp_init,
324 .window = psy_3gpp_window, 626 .window = psy_lame_window,
325 .analyze = psy_3gpp_analyze, 627 .analyze = psy_3gpp_analyze,
326 .end = psy_3gpp_end, 628 .end = psy_3gpp_end,
327 }; 629 };