Mercurial > libavcodec.hg
comparison aacpsy.c @ 12408:ae72506d4c2a libavcodec
aacenc: LAME-inspired window decision
This performs quite a bit better than the current 3GPP-inspired window decision
on all the samples I have tested. On the castanets.wav sample it performs very
similarly to iTunes window selection, and seems to perform better than Nero.
On fatboy.wav, it seems to perform at least as well as iTunes, if not better.
Nero performs horribly on this sample.
Patch by: Nathan Caldwell <saintdev@gmail.com>
author | alexc |
---|---|
date | Mon, 23 Aug 2010 20:00:03 +0000 |
parents | b25537518e40 |
children | 94b578d0af10 |
comparison
equal
deleted
inserted
replaced
12407:00676ed9b822 | 12408:ae72506d4c2a |
---|---|
42 #define PSY_3GPP_SPREAD_LOW 1.5f // spreading factor for ascending threshold spreading (15 dB/Bark) | 42 #define PSY_3GPP_SPREAD_LOW 1.5f // spreading factor for ascending threshold spreading (15 dB/Bark) |
43 #define PSY_3GPP_SPREAD_HI 3.0f // spreading factor for descending threshold spreading (30 dB/Bark) | 43 #define PSY_3GPP_SPREAD_HI 3.0f // spreading factor for descending threshold spreading (30 dB/Bark) |
44 | 44 |
45 #define PSY_3GPP_RPEMIN 0.01f | 45 #define PSY_3GPP_RPEMIN 0.01f |
46 #define PSY_3GPP_RPELEV 2.0f | 46 #define PSY_3GPP_RPELEV 2.0f |
47 | |
48 /* LAME psy model constants */ | |
49 #define PSY_LAME_FIR_LEN 21 ///< LAME psy model FIR order | |
50 #define AAC_BLOCK_SIZE_LONG 1024 ///< long block size | |
51 #define AAC_BLOCK_SIZE_SHORT 128 ///< short block size | |
52 #define AAC_NUM_BLOCKS_SHORT 8 ///< number of blocks in a short sequence | |
53 #define PSY_LAME_NUM_SUBBLOCKS 3 ///< Number of sub-blocks in each short block | |
54 | |
47 /** | 55 /** |
48 * @} | 56 * @} |
49 */ | 57 */ |
50 | 58 |
51 /** | 59 /** |
68 | 76 |
69 float win_energy; ///< sliding average of channel energy | 77 float win_energy; ///< sliding average of channel energy |
70 float iir_state[2]; ///< hi-pass IIR filter state | 78 float iir_state[2]; ///< hi-pass IIR filter state |
71 uint8_t next_grouping; ///< stored grouping scheme for the next frame (in case of 8 short window sequence) | 79 uint8_t next_grouping; ///< stored grouping scheme for the next frame (in case of 8 short window sequence) |
72 enum WindowSequence next_window_seq; ///< window sequence to be used in the next frame | 80 enum WindowSequence next_window_seq; ///< window sequence to be used in the next frame |
81 /* LAME psy model specific members */ | |
82 float attack_threshold; ///< attack threshold for this channel | |
83 float prev_energy_subshort[AAC_NUM_BLOCKS_SHORT * PSY_LAME_NUM_SUBBLOCKS]; | |
84 int prev_attack; ///< attack value for the last short block in the previous sequence | |
73 }AacPsyChannel; | 85 }AacPsyChannel; |
74 | 86 |
75 /** | 87 /** |
76 * psychoacoustic model frame type-dependent coefficients | 88 * psychoacoustic model frame type-dependent coefficients |
77 */ | 89 */ |
87 */ | 99 */ |
88 typedef struct AacPsyContext{ | 100 typedef struct AacPsyContext{ |
89 AacPsyCoeffs psy_coef[2]; | 101 AacPsyCoeffs psy_coef[2]; |
90 AacPsyChannel *ch; | 102 AacPsyChannel *ch; |
91 }AacPsyContext; | 103 }AacPsyContext; |
104 | |
105 /** | |
106 * LAME psy model preset: maps a quality setting to a short-block attack threshold | |
107 */ | |
108 typedef struct { | |
109 int quality; ///< Quality to map the rest of the values to. | |
110 /* This is overloaded to be both kbps per channel in ABR mode, and | |
111 * requested quality in constant quality mode. | |
112 */ | |
113 float st_lrm; ///< short threshold for L, R, and M channels; copied into AacPsyChannel.attack_threshold at init | |
114 } PsyLamePreset; | |
115 | |
116 /** | |
117 * LAME psy model preset table for ABR | |
118 */ | |
119 static const PsyLamePreset psy_abr_map[] = { | |
120 /* TODO: Tuning. These were taken from LAME. */ | |
121 /* kbps/ch st_lrm */ | |
122 { 8, 6.60}, | |
123 { 16, 6.60}, | |
124 { 24, 6.60}, | |
125 { 32, 6.60}, | |
126 { 40, 6.60}, | |
127 { 48, 6.60}, | |
128 { 56, 6.60}, | |
129 { 64, 6.40}, | |
130 { 80, 6.00}, | |
131 { 96, 5.60}, | |
132 {112, 5.20}, | |
133 {128, 5.20}, | |
134 {160, 5.20} | |
135 }; | |
136 | |
137 /** | |
138 * LAME psy model preset table for constant quality | |
139 */ | |
140 static const PsyLamePreset psy_vbr_map[] = { | |
141 /* vbr_q st_lrm */ | |
142 { 0, 4.20}, | |
143 { 1, 4.20}, | |
144 { 2, 4.20}, | |
145 { 3, 4.20}, | |
146 { 4, 4.20}, | |
147 { 5, 4.20}, | |
148 { 6, 4.20}, | |
149 { 7, 4.20}, | |
150 { 8, 4.20}, | |
151 { 9, 4.20}, | |
152 {10, 4.20} | |
153 }; | |
154 | |
155 /** | |
156 * LAME psy model FIR coefficient table | |
157 */ | |
158 static const float psy_fir_coeffs[] = { | |
159 -8.65163e-18 * 2, -0.00851586 * 2, -6.74764e-18 * 2, 0.0209036 * 2, | |
160 -3.36639e-17 * 2, -0.0438162 * 2, -1.54175e-17 * 2, 0.0931738 * 2, | |
161 -5.52212e-17 * 2, -0.313819 * 2 | |
162 }; | |
163 | |
164 /** | |
165 * Calculate the attack threshold for ABR from psy_abr_map for the LAME psy model: finds the two table entries whose kbps values bracket bitrate and returns the st_lrm of the nearer one (a tie goes to the higher entry) | |
166 */ | |
167 static float lame_calc_attack_threshold(int bitrate) | |
168 { | |
169 /* Assume max bitrate (index 12, the last psy_abr_map entry) to start with */ | |
170 int lower_range = 12, upper_range = 12; | |
171 int lower_range_kbps = psy_abr_map[12].quality; | |
172 int upper_range_kbps = psy_abr_map[12].quality; | |
173 int i; | |
174 | |
175 /* Determine which two table bitrates the specified value falls between. | |
176 * If the loop ends without breaking, the value is at or above the last table entry, so the max-bitrate assumption above stands. | |
177 */ | |
178 for (i = 1; i < 13; i++) { /* 13 == number of psy_abr_map entries */ | |
179 if (FFMAX(bitrate, psy_abr_map[i].quality) != bitrate) { /* true when the table entry is strictly greater than bitrate */ | |
180 upper_range = i; | |
181 upper_range_kbps = psy_abr_map[i ].quality; | |
182 lower_range = i - 1; | |
183 lower_range_kbps = psy_abr_map[i - 1].quality; | |
184 break; /* Upper range found */ | |
185 } | |
186 } | |
187 | |
188 /* Determine which range the value specified is closer to */ | |
189 if ((upper_range_kbps - bitrate) > (bitrate - lower_range_kbps)) | |
190 return psy_abr_map[lower_range].st_lrm; | |
191 return psy_abr_map[upper_range].st_lrm; | |
192 } | |
193 | |
194 /** | |
195 * LAME psy model specific initialization: for every channel, pick the attack threshold (from psy_vbr_map when CODEC_FLAG_QSCALE is set, otherwise from the per-channel bitrate in kbps) and seed the sub-block energy history with 10.0 | |
196 */ | |
197 static void lame_window_init(AacPsyContext *ctx, AVCodecContext *avctx) { | |
198 int i, j; | |
199 | |
200 for (i = 0; i < avctx->channels; i++) { | |
201 AacPsyChannel *pch = &ctx->ch[i]; | |
202 | |
203 if (avctx->flags & CODEC_FLAG_QSCALE) | |
204 pch->attack_threshold = psy_vbr_map[avctx->global_quality / FF_QP2LAMBDA].st_lrm; /* NOTE(review): index is not range-checked against the 11-entry table */ | |
205 else | |
206 pch->attack_threshold = lame_calc_attack_threshold(avctx->bit_rate / avctx->channels / 1000); | |
207 | |
208 for (j = 0; j < AAC_NUM_BLOCKS_SHORT * PSY_LAME_NUM_SUBBLOCKS; j++) /* own index: must not reuse i, which would terminate the channel loop after the first channel */ | |
209 pch->prev_energy_subshort[j] = 10.0f; | |
210 } | |
211 } | |
92 | 212 |
93 /** | 213 /** |
94 * Calculate Bark value for given line. | 214 * Calculate Bark value for given line. |
95 */ | 215 */ |
96 static av_cold float calc_bark(float f) | 216 static av_cold float calc_bark(float f) |
146 start += ctx->bands[j][g]; | 266 start += ctx->bands[j][g]; |
147 } | 267 } |
148 } | 268 } |
149 | 269 |
150 pctx->ch = av_mallocz(sizeof(AacPsyChannel) * ctx->avctx->channels); | 270 pctx->ch = av_mallocz(sizeof(AacPsyChannel) * ctx->avctx->channels); |
271 | |
272 lame_window_init(pctx, ctx->avctx); | |
273 | |
151 return 0; | 274 return 0; |
152 } | 275 } |
153 | 276 |
154 /** | 277 /** |
155 * IIR filter used in block switching decision | 278 * IIR filter used in block switching decision |
314 AacPsyContext *pctx = (AacPsyContext*) apc->model_priv_data; | 437 AacPsyContext *pctx = (AacPsyContext*) apc->model_priv_data; |
315 av_freep(&pctx->ch); | 438 av_freep(&pctx->ch); |
316 av_freep(&apc->model_priv_data); | 439 av_freep(&apc->model_priv_data); |
317 } | 440 } |
318 | 441 |
442 static void lame_apply_block_type(AacPsyChannel *ctx, FFPsyWindowInfo *wi, int uselongblock) | |
443 { /* Emits the previously scheduled window sequence into wi->window_type[0] and schedules the next one from uselongblock */ | |
444 int blocktype = ONLY_LONG_SEQUENCE; /* sequence to store for the following call */ | |
445 if (uselongblock) { | |
446 if (ctx->next_window_seq == EIGHT_SHORT_SEQUENCE) /* a short sequence is pending: follow it with a stop window */ | |
447 blocktype = LONG_STOP_SEQUENCE; | |
448 } else { | |
449 blocktype = EIGHT_SHORT_SEQUENCE; | |
450 if (ctx->next_window_seq == ONLY_LONG_SEQUENCE) /* pending long window becomes a start window ahead of the short sequence */ | |
451 ctx->next_window_seq = LONG_START_SEQUENCE; | |
452 if (ctx->next_window_seq == LONG_STOP_SEQUENCE) /* pending stop window is replaced by another short sequence */ | |
453 ctx->next_window_seq = EIGHT_SHORT_SEQUENCE; | |
454 } | |
455 | |
456 wi->window_type[0] = ctx->next_window_seq; /* report the (possibly adjusted) pending sequence for this frame */ | |
457 ctx->next_window_seq = blocktype; /* remember the decision for the next call */ | |
458 } | |
459 | |
460 static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx, | |
461 const int16_t *audio, const int16_t *la, | |
462 int channel, int prev_type) | |
463 { | |
464 AacPsyContext *pctx = (AacPsyContext*) ctx->model_priv_data; | |
465 AacPsyChannel *pch = &pctx->ch[channel]; | |
466 int grouping = 0; | |
467 int uselongblock = 1; | |
468 int attacks[AAC_NUM_BLOCKS_SHORT + 1] = { 0 }; | |
469 int i; | |
470 FFPsyWindowInfo wi; | |
471 | |
472 memset(&wi, 0, sizeof(wi)); | |
473 if (la) { | |
474 float hpfsmpl[AAC_BLOCK_SIZE_LONG]; | |
475 float const *pf = hpfsmpl; | |
476 float attack_intensity[(AAC_NUM_BLOCKS_SHORT + 1) * PSY_LAME_NUM_SUBBLOCKS]; | |
477 float energy_subshort[(AAC_NUM_BLOCKS_SHORT + 1) * PSY_LAME_NUM_SUBBLOCKS]; | |
478 float energy_short[AAC_NUM_BLOCKS_SHORT + 1] = { 0 }; | |
479 int chans = ctx->avctx->channels; | |
480 const int16_t *firbuf = la + (AAC_BLOCK_SIZE_SHORT/4 - PSY_LAME_FIR_LEN) * chans; | |
481 int j, att_sum = 0; | |
482 | |
483 /* LAME comment: apply high pass filter of fs/4 */ | |
484 for (i = 0; i < AAC_BLOCK_SIZE_LONG; i++) { | |
485 float sum1, sum2; | |
486 sum1 = firbuf[(i + ((PSY_LAME_FIR_LEN - 1) / 2)) * chans]; | |
487 sum2 = 0.0; | |
488 for (j = 0; j < ((PSY_LAME_FIR_LEN - 1) / 2) - 1; j += 2) { | |
489 sum1 += psy_fir_coeffs[j] * (firbuf[(i + j) * chans] + firbuf[(i + PSY_LAME_FIR_LEN - j) * chans]); | |
490 sum2 += psy_fir_coeffs[j + 1] * (firbuf[(i + j + 1) * chans] + firbuf[(i + PSY_LAME_FIR_LEN - j - 1) * chans]); | |
491 } | |
492 hpfsmpl[i] = sum1 + sum2; | |
493 } | |
494 | |
495 /* Calculate the energies of each sub-shortblock */ | |
496 for (i = 0; i < PSY_LAME_NUM_SUBBLOCKS; i++) { | |
497 energy_subshort[i] = pch->prev_energy_subshort[i + ((AAC_NUM_BLOCKS_SHORT - 1) * PSY_LAME_NUM_SUBBLOCKS)]; | |
498 assert(pch->prev_energy_subshort[i + ((AAC_NUM_BLOCKS_SHORT - 2) * PSY_LAME_NUM_SUBBLOCKS + 1)] > 0); | |
499 attack_intensity[i] = energy_subshort[i] / pch->prev_energy_subshort[i + ((AAC_NUM_BLOCKS_SHORT - 2) * PSY_LAME_NUM_SUBBLOCKS + 1)]; | |
500 energy_short[0] += energy_subshort[i]; | |
501 } | |
502 | |
503 for (i = 0; i < AAC_NUM_BLOCKS_SHORT * PSY_LAME_NUM_SUBBLOCKS; i++) { | |
504 float const *const pfe = pf + AAC_BLOCK_SIZE_LONG / (AAC_NUM_BLOCKS_SHORT * PSY_LAME_NUM_SUBBLOCKS); | |
505 float p = 1.0f; | |
506 for (; pf < pfe; pf++) | |
507 if (p < fabsf(*pf)) | |
508 p = fabsf(*pf); | |
509 pch->prev_energy_subshort[i] = energy_subshort[i + PSY_LAME_NUM_SUBBLOCKS] = p; | |
510 energy_short[1 + i / PSY_LAME_NUM_SUBBLOCKS] += p; | |
511 /* FIXME: The indexes below are [i + 3 - 2] in the LAME source. | |
512 * Obviously the 3 and 2 have some significance, or this would be just [i + 1] | |
513 * (which is what we use here). What the 3 stands for is ambiguous, as it is both | |
514 * number of short blocks, and the number of sub-short blocks. | |
515 * It seems that LAME is comparing each sub-block to sub-block + 1 in the | |
516 * previous block. | |
517 */ | |
518 if (p > energy_subshort[i + 1]) | |
519 p = p / energy_subshort[i + 1]; | |
520 else if (energy_subshort[i + 1] > p * 10.0f) | |
521 p = energy_subshort[i + 1] / (p * 10.0f); | |
522 else | |
523 p = 0.0; | |
524 attack_intensity[i + PSY_LAME_NUM_SUBBLOCKS] = p; | |
525 } | |
526 | |
527 /* compare energy between sub-short blocks */ | |
528 for (i = 0; i < (AAC_NUM_BLOCKS_SHORT + 1) * PSY_LAME_NUM_SUBBLOCKS; i++) | |
529 if (!attacks[i / PSY_LAME_NUM_SUBBLOCKS]) | |
530 if (attack_intensity[i] > pch->attack_threshold) | |
531 attacks[i / PSY_LAME_NUM_SUBBLOCKS] = (i % PSY_LAME_NUM_SUBBLOCKS) + 1; | |
532 | |
533 /* should have energy change between short blocks, in order to avoid periodic signals */ | |
534 /* Good samples to show the effect are Trumpet test songs */ | |
535 /* GB: tuned (1) to avoid too many short blocks for test sample TRUMPET */ | |
536 /* RH: tuned (2) to let enough short blocks through for test sample FSOL and SNAPS */ | |
537 for (i = 1; i < AAC_NUM_BLOCKS_SHORT + 1; i++) { | |
538 float const u = energy_short[i - 1]; | |
539 float const v = energy_short[i]; | |
540 float const m = FFMAX(u, v); | |
541 if (m < 40000) { /* (2) */ | |
542 if (u < 1.7f * v && v < 1.7f * u) { /* (1) */ | |
543 if (i == 1 && attacks[0] < attacks[i]) | |
544 attacks[0] = 0; | |
545 attacks[i] = 0; | |
546 } | |
547 } | |
548 att_sum += attacks[i]; | |
549 } | |
550 | |
551 if (attacks[0] <= pch->prev_attack) | |
552 attacks[0] = 0; | |
553 | |
554 att_sum += attacks[0]; | |
555 /* 3 below indicates the previous attack happened in the last sub-block of the previous sequence */ | |
556 if (pch->prev_attack == 3 || att_sum) { | |
557 uselongblock = 0; | |
558 | |
559 if (attacks[1] && attacks[0]) | |
560 attacks[1] = 0; | |
561 if (attacks[2] && attacks[1]) | |
562 attacks[2] = 0; | |
563 if (attacks[3] && attacks[2]) | |
564 attacks[3] = 0; | |
565 if (attacks[4] && attacks[3]) | |
566 attacks[4] = 0; | |
567 if (attacks[5] && attacks[4]) | |
568 attacks[5] = 0; | |
569 if (attacks[6] && attacks[5]) | |
570 attacks[6] = 0; | |
571 if (attacks[7] && attacks[6]) | |
572 attacks[7] = 0; | |
573 if (attacks[8] && attacks[7]) | |
574 attacks[8] = 0; | |
575 } | |
576 } else { | |
577 /* We have no lookahead info, so just use same type as the previous sequence. */ | |
578 uselongblock = !(prev_type == EIGHT_SHORT_SEQUENCE); | |
579 } | |
580 | |
581 lame_apply_block_type(pch, &wi, uselongblock); | |
582 | |
583 wi.window_type[1] = prev_type; | |
584 if (wi.window_type[0] != EIGHT_SHORT_SEQUENCE) { | |
585 wi.num_windows = 1; | |
586 wi.grouping[0] = 1; | |
587 if (wi.window_type[0] == LONG_START_SEQUENCE) | |
588 wi.window_shape = 0; | |
589 else | |
590 wi.window_shape = 1; | |
591 } else { | |
592 int lastgrp = 0; | |
593 | |
594 wi.num_windows = 8; | |
595 wi.window_shape = 0; | |
596 for (i = 0; i < 8; i++) { | |
597 if (!((pch->next_grouping >> i) & 1)) | |
598 lastgrp = i; | |
599 wi.grouping[lastgrp]++; | |
600 } | |
601 } | |
602 | |
603 /* Determine grouping, based on the location of the first attack, and save for | |
604 * the next frame. | |
605 * FIXME: Move this to analysis. | |
606 * TODO: Tune groupings depending on attack location | |
607 * TODO: Handle more than one attack in a group | |
608 */ | |
609 for (i = 0; i < 9; i++) { | |
610 if (attacks[i]) { | |
611 grouping = i; | |
612 break; | |
613 } | |
614 } | |
615 pch->next_grouping = window_grouping[grouping]; | |
616 | |
617 pch->prev_attack = attacks[8]; | |
618 | |
619 return wi; | |
620 } | |
319 | 621 |
320 const FFPsyModel ff_aac_psy_model = | 622 const FFPsyModel ff_aac_psy_model = |
321 { | 623 { |
322 .name = "3GPP TS 26.403-inspired model", | 624 .name = "3GPP TS 26.403-inspired model", |
323 .init = psy_3gpp_init, | 625 .init = psy_3gpp_init, |
324 .window = psy_3gpp_window, | 626 .window = psy_lame_window, |
325 .analyze = psy_3gpp_analyze, | 627 .analyze = psy_3gpp_analyze, |
326 .end = psy_3gpp_end, | 628 .end = psy_3gpp_end, |
327 }; | 629 }; |