Mercurial > libavcodec.hg
comparison mpegaudiodec.c @ 2489:2e5ae040e92b libavcodec
optimizing imdct36()
author | michael |
---|---|
date | Tue, 01 Feb 2005 21:27:18 +0000 |
parents | dfdb6bf4b90f |
children | c2c642ad6ef4 |
comparison
equal
deleted
inserted
replaced
2488:8dd4672b9f74 | 2489:2e5ae040e92b |
---|---|
66 #define MUL64(a,b) ((int64_t)(a) * (int64_t)(b)) | 66 #define MUL64(a,b) ((int64_t)(a) * (int64_t)(b)) |
67 #define FIX(a) ((int)((a) * FRAC_ONE)) | 67 #define FIX(a) ((int)((a) * FRAC_ONE)) |
68 /* WARNING: only correct for posititive numbers */ | 68 /* WARNING: only correct for posititive numbers */ |
69 #define FIXR(a) ((int)((a) * FRAC_ONE + 0.5)) | 69 #define FIXR(a) ((int)((a) * FRAC_ONE + 0.5)) |
70 #define FRAC_RND(a) (((a) + (FRAC_ONE/2)) >> FRAC_BITS) | 70 #define FRAC_RND(a) (((a) + (FRAC_ONE/2)) >> FRAC_BITS) |
71 | |
72 #define FIXHR(a) ((int)((a) * (1LL<<32) + 0.5)) | |
73 //#define MULH(a,b) (((int64_t)(a) * (int64_t)(b))>>32) //gcc 3.4 creates an incredibly bloated mess out of this | |
74 static always_inline int MULH(int a, int b){ | |
75 return ((int64_t)(a) * (int64_t)(b))>>32; | |
76 } | |
71 | 77 |
72 #if FRAC_BITS <= 15 | 78 #if FRAC_BITS <= 15 |
73 typedef int16_t MPA_INT; | 79 typedef int16_t MPA_INT; |
74 #else | 80 #else |
75 typedef int32_t MPA_INT; | 81 typedef int32_t MPA_INT; |
491 csa_table_float[i][0] = cs; | 497 csa_table_float[i][0] = cs; |
492 csa_table_float[i][1] = ca; | 498 csa_table_float[i][1] = ca; |
493 csa_table_float[i][2] = ca + cs; | 499 csa_table_float[i][2] = ca + cs; |
494 csa_table_float[i][3] = ca - cs; | 500 csa_table_float[i][3] = ca - cs; |
495 // printf("%d %d %d %d\n", FIX(cs), FIX(cs-1), FIX(ca), FIX(cs)-FIX(ca)); | 501 // printf("%d %d %d %d\n", FIX(cs), FIX(cs-1), FIX(ca), FIX(cs)-FIX(ca)); |
502 // av_log(NULL, AV_LOG_DEBUG,"%f %f %f %f\n", cs, ca, ca+cs, ca-cs); | |
496 } | 503 } |
497 | 504 |
498 /* compute mdct windows */ | 505 /* compute mdct windows */ |
499 for(i=0;i<36;i++) { | 506 for(i=0;i<36;i++) { |
500 int v; | 507 for(j=0; j<4; j++){ |
501 v = FIXR(sin(M_PI * (i + 0.5) / 36.0)); | 508 double d; |
502 mdct_win[0][i] = v; | 509 if(j==2) continue; |
503 mdct_win[1][i] = v; | 510 |
504 mdct_win[3][i] = v; | 511 d= sin(M_PI * (i + 0.5) / 36.0); |
505 } | 512 if(j==1){ |
506 for(i=0;i<6;i++) { | 513 if (i>=30) d= 0; |
507 mdct_win[1][18 + i] = FIXR(1.0); | 514 else if(i>=24) d= sin(M_PI * (i - 18 + 0.5) / 12.0); |
508 mdct_win[1][24 + i] = FIXR(sin(M_PI * ((i + 6) + 0.5) / 12.0)); | 515 else if(i>=18) d= 1; |
509 mdct_win[1][30 + i] = FIXR(0.0); | 516 }else if(j==3){ |
510 | 517 if (i< 6) d= 0; |
511 mdct_win[3][i] = FIXR(0.0); | 518 else if(i< 12) d= sin(M_PI * (i - 6 + 0.5) / 12.0); |
512 mdct_win[3][6 + i] = FIXR(sin(M_PI * (i + 0.5) / 12.0)); | 519 else if(i< 18) d= 1; |
513 mdct_win[3][12 + i] = FIXR(1.0); | 520 } |
521 //merge last stage of imdct into the window coefficients | |
522 if (i/9 == 0) d*= 0.5 / cos(M_PI*(2*( i) +19)/72); | |
523 else if(i/9 == 1) d*= 0.5 / cos(M_PI*(2*(17 - i) +19)/72); | |
524 else if(i/9 == 2) d*= 0.5 / cos(M_PI*(2*( i) +19)/72); | |
525 else d*=-0.5 / cos(M_PI*(2*(17 - i) +19)/72); | |
526 mdct_win[j][i] = FIXHR((d / (1<<5))); | |
527 // av_log(NULL, AV_LOG_DEBUG, "%2d %d %f\n", i,j,d / (1<<5)); | |
528 } | |
514 } | 529 } |
515 | 530 |
516 for(i=0;i<12;i++) | 531 for(i=0;i<12;i++) |
517 mdct_win[2][i] = FIXR(sin(M_PI * (i + 0.5) / 12.0)); | 532 mdct_win[2][i] = FIXR(sin(M_PI * (i + 0.5) / 12.0)); |
518 | 533 |
1009 #undef C7 | 1024 #undef C7 |
1010 #undef C9 | 1025 #undef C9 |
1011 #undef C11 | 1026 #undef C11 |
1012 | 1027 |
1013 /* cos(pi*i/18) */ | 1028 /* cos(pi*i/18) */ |
1014 #define C1 FIXR(0.98480775301220805936) | 1029 #define C1 FIXHR(0.98480775301220805936/2) |
1015 #define C2 FIXR(0.93969262078590838405) | 1030 #define C2 FIXHR(0.93969262078590838405/2) |
1016 #define C3 FIXR(0.86602540378443864676) | 1031 #define C3 FIXHR(0.86602540378443864676/2) |
1017 #define C4 FIXR(0.76604444311897803520) | 1032 #define C4 FIXHR(0.76604444311897803520/2) |
1018 #define C5 FIXR(0.64278760968653932632) | 1033 #define C5 FIXHR(0.64278760968653932632/2) |
1019 #define C6 FIXR(0.5) | 1034 #define C6 FIXHR(0.5/2) |
1020 #define C7 FIXR(0.34202014332566873304) | 1035 #define C7 FIXHR(0.34202014332566873304/2) |
1021 #define C8 FIXR(0.17364817766693034885) | 1036 #define C8 FIXHR(0.17364817766693034885/2) |
1037 | |
1022 | 1038 |
1023 /* 0.5 / cos(pi*(2*i+1)/36) */ | 1039 /* 0.5 / cos(pi*(2*i+1)/36) */ |
1024 static const int icos36[9] = { | 1040 static const int icos36[9] = { |
1025 FIXR(0.50190991877167369479), | 1041 FIXR(0.50190991877167369479), |
1026 FIXR(0.51763809020504152469), | 1042 FIXR(0.51763809020504152469), |
1030 FIXR(0.87172339781054900991), | 1046 FIXR(0.87172339781054900991), |
1031 FIXR(1.18310079157624925896), | 1047 FIXR(1.18310079157624925896), |
1032 FIXR(1.93185165257813657349), | 1048 FIXR(1.93185165257813657349), |
1033 FIXR(5.73685662283492756461), | 1049 FIXR(5.73685662283492756461), |
1034 }; | 1050 }; |
1035 | |
1036 static const int icos72[18] = { | |
1037 /* 0.5 / cos(pi*(2*i+19)/72) */ | |
1038 FIXR(0.74009361646113053152), | |
1039 FIXR(0.82133981585229078570), | |
1040 FIXR(0.93057949835178895673), | |
1041 FIXR(1.08284028510010010928), | |
1042 FIXR(1.30656296487637652785), | |
1043 FIXR(1.66275476171152078719), | |
1044 FIXR(2.31011315767264929558), | |
1045 FIXR(3.83064878777019433457), | |
1046 FIXR(11.46279281302667383546), | |
1047 | |
1048 /* 0.5 / cos(pi*(2*(i + 18) +19)/72) */ | |
1049 FIXR(-0.67817085245462840086), | |
1050 FIXR(-0.63023620700513223342), | |
1051 FIXR(-0.59284452371708034528), | |
1052 FIXR(-0.56369097343317117734), | |
1053 FIXR(-0.54119610014619698439), | |
1054 FIXR(-0.52426456257040533932), | |
1055 FIXR(-0.51213975715725461845), | |
1056 FIXR(-0.50431448029007636036), | |
1057 FIXR(-0.50047634258165998492), | |
1058 }; | |
1059 | |
1060 /* using Lee like decomposition followed by hand coded 9 points DCT */ | 1051 /* using Lee like decomposition followed by hand coded 9 points DCT */ |
1061 static void imdct36(int *out, int *in) | 1052 static void imdct36(int *out, int *buf, int *in, int *win) |
1062 { | 1053 { |
1063 int i, j, t0, t1, t2, t3, s0, s1, s2, s3; | 1054 int i, j, t0, t1, t2, t3, s0, s1, s2, s3; |
1064 int tmp[18], *tmp1, *in1; | 1055 int tmp[18], *tmp1, *in1; |
1065 int64_t in3_3, in6_6; | |
1066 | 1056 |
1067 for(i=17;i>=1;i--) | 1057 for(i=17;i>=1;i--) |
1068 in[i] += in[i-1]; | 1058 in[i] += in[i-1]; |
1069 for(i=17;i>=3;i-=2) | 1059 for(i=17;i>=3;i-=2) |
1070 in[i] += in[i-2]; | 1060 in[i] += in[i-2]; |
1071 | 1061 |
1072 for(j=0;j<2;j++) { | 1062 for(j=0;j<2;j++) { |
1073 tmp1 = tmp + j; | 1063 tmp1 = tmp + j; |
1074 in1 = in + j; | 1064 in1 = in + j; |
1075 | 1065 #if 0 |
1076 in3_3 = MUL64(in1[2*3], C3); | 1066 //more accurate but slower |
1077 in6_6 = MUL64(in1[2*6], C6); | 1067 int64_t t0, t1, t2, t3; |
1078 | 1068 t2 = in1[2*4] + in1[2*8] - in1[2*2]; |
1079 tmp1[0] = FRAC_RND(MUL64(in1[2*1], C1) + in3_3 + | 1069 |
1080 MUL64(in1[2*5], C5) + MUL64(in1[2*7], C7)); | 1070 t3 = (in1[2*0] + (int64_t)(in1[2*6]>>1))<<32; |
1081 tmp1[2] = in1[2*0] + FRAC_RND(MUL64(in1[2*2], C2) + | 1071 t1 = in1[2*0] - in1[2*6]; |
1082 MUL64(in1[2*4], C4) + in6_6 + | 1072 tmp1[ 6] = t1 - (t2>>1); |
1083 MUL64(in1[2*8], C8)); | 1073 tmp1[16] = t1 + t2; |
1084 tmp1[4] = FRAC_RND(MUL64(in1[2*1] - in1[2*5] - in1[2*7], C3)); | 1074 |
1085 tmp1[6] = FRAC_RND(MUL64(in1[2*2] - in1[2*4] - in1[2*8], C6)) - | 1075 t0 = MUL64(2*(in1[2*2] + in1[2*4]), C2); |
1086 in1[2*6] + in1[2*0]; | 1076 t1 = MUL64( in1[2*4] - in1[2*8] , -2*C8); |
1087 tmp1[8] = FRAC_RND(MUL64(in1[2*1], C5) - in3_3 - | 1077 t2 = MUL64(2*(in1[2*2] + in1[2*8]), -C4); |
1088 MUL64(in1[2*5], C7) + MUL64(in1[2*7], C1)); | 1078 |
1089 tmp1[10] = in1[2*0] + FRAC_RND(MUL64(-in1[2*2], C8) - | 1079 tmp1[10] = (t3 - t0 - t2) >> 32; |
1090 MUL64(in1[2*4], C2) + in6_6 + | 1080 tmp1[ 2] = (t3 + t0 + t1) >> 32; |
1091 MUL64(in1[2*8], C4)); | 1081 tmp1[14] = (t3 + t2 - t1) >> 32; |
1092 tmp1[12] = FRAC_RND(MUL64(in1[2*1], C7) - in3_3 + | 1082 |
1093 MUL64(in1[2*5], C1) - | 1083 tmp1[ 4] = MULH(2*(in1[2*5] + in1[2*7] - in1[2*1]), -C3); |
1094 MUL64(in1[2*7], C5)); | 1084 t2 = MUL64(2*(in1[2*1] + in1[2*5]), C1); |
1095 tmp1[14] = in1[2*0] + FRAC_RND(MUL64(-in1[2*2], C4) + | 1085 t3 = MUL64( in1[2*5] - in1[2*7] , -2*C7); |
1096 MUL64(in1[2*4], C8) + in6_6 - | 1086 t0 = MUL64(2*in1[2*3], C3); |
1097 MUL64(in1[2*8], C2)); | 1087 |
1098 tmp1[16] = in1[2*0] - in1[2*2] + in1[2*4] - in1[2*6] + in1[2*8]; | 1088 t1 = MUL64(2*(in1[2*1] + in1[2*7]), -C5); |
1089 | |
1090 tmp1[ 0] = (t2 + t3 + t0) >> 32; | |
1091 tmp1[12] = (t2 + t1 - t0) >> 32; | |
1092 tmp1[ 8] = (t3 - t1 - t0) >> 32; | |
1093 #else | |
1094 t2 = in1[2*4] + in1[2*8] - in1[2*2]; | |
1095 | |
1096 t3 = in1[2*0] + (in1[2*6]>>1); | |
1097 t1 = in1[2*0] - in1[2*6]; | |
1098 tmp1[ 6] = t1 - (t2>>1); | |
1099 tmp1[16] = t1 + t2; | |
1100 | |
1101 t0 = MULH(2*(in1[2*2] + in1[2*4]), C2); | |
1102 t1 = MULH( in1[2*4] - in1[2*8] , -2*C8); | |
1103 t2 = MULH(2*(in1[2*2] + in1[2*8]), -C4); | |
1104 | |
1105 tmp1[10] = t3 - t0 - t2; | |
1106 tmp1[ 2] = t3 + t0 + t1; | |
1107 tmp1[14] = t3 + t2 - t1; | |
1108 | |
1109 tmp1[ 4] = MULH(2*(in1[2*5] + in1[2*7] - in1[2*1]), -C3); | |
1110 t2 = MULH(2*(in1[2*1] + in1[2*5]), C1); | |
1111 t3 = MULH( in1[2*5] - in1[2*7] , -2*C7); | |
1112 t0 = MULH(2*in1[2*3], C3); | |
1113 | |
1114 t1 = MULH(2*(in1[2*1] + in1[2*7]), -C5); | |
1115 | |
1116 tmp1[ 0] = t2 + t3 + t0; | |
1117 tmp1[12] = t2 + t1 - t0; | |
1118 tmp1[ 8] = t3 - t1 - t0; | |
1119 #endif | |
1099 } | 1120 } |
1100 | 1121 |
1101 i = 0; | 1122 i = 0; |
1102 for(j=0;j<4;j++) { | 1123 for(j=0;j<4;j++) { |
1103 t0 = tmp[i]; | 1124 t0 = tmp[i]; |
1108 t2 = tmp[i + 1]; | 1129 t2 = tmp[i + 1]; |
1109 t3 = tmp[i + 3]; | 1130 t3 = tmp[i + 3]; |
1110 s1 = MULL(t3 + t2, icos36[j]); | 1131 s1 = MULL(t3 + t2, icos36[j]); |
1111 s3 = MULL(t3 - t2, icos36[8 - j]); | 1132 s3 = MULL(t3 - t2, icos36[8 - j]); |
1112 | 1133 |
1113 t0 = MULL(s0 + s1, icos72[9 + 8 - j]); | 1134 t0 = (s0 + s1) << 5; |
1114 t1 = MULL(s0 - s1, icos72[8 - j]); | 1135 t1 = (s0 - s1) << 5; |
1115 out[18 + 9 + j] = t0; | 1136 out[(9 + j)*SBLIMIT] = -MULH(t1, win[9 + j]) + buf[9 + j]; |
1116 out[18 + 8 - j] = t0; | 1137 out[(8 - j)*SBLIMIT] = MULH(t1, win[8 - j]) + buf[8 - j]; |
1117 out[9 + j] = -t1; | 1138 buf[9 + j] = MULH(t0, win[18 + 9 + j]); |
1118 out[8 - j] = t1; | 1139 buf[8 - j] = MULH(t0, win[18 + 8 - j]); |
1119 | 1140 |
1120 t0 = MULL(s2 + s3, icos72[9+j]); | 1141 t0 = (s2 + s3) << 5; |
1121 t1 = MULL(s2 - s3, icos72[j]); | 1142 t1 = (s2 - s3) << 5; |
1122 out[18 + 9 + (8 - j)] = t0; | 1143 out[(9 + 8 - j)*SBLIMIT] = -MULH(t1, win[9 + 8 - j]) + buf[9 + 8 - j]; |
1123 out[18 + j] = t0; | 1144 out[( j)*SBLIMIT] = MULH(t1, win[ j]) + buf[ j]; |
1124 out[9 + (8 - j)] = -t1; | 1145 buf[9 + 8 - j] = MULH(t0, win[18 + 9 + 8 - j]); |
1125 out[j] = t1; | 1146 buf[ + j] = MULH(t0, win[18 + j]); |
1126 i += 4; | 1147 i += 4; |
1127 } | 1148 } |
1128 | 1149 |
1129 s0 = tmp[16]; | 1150 s0 = tmp[16]; |
1130 s1 = MULL(tmp[17], icos36[4]); | 1151 s1 = MULL(tmp[17], icos36[4]); |
1131 t0 = MULL(s0 + s1, icos72[9 + 4]); | 1152 t0 = (s0 + s1) << 5; |
1132 t1 = MULL(s0 - s1, icos72[4]); | 1153 t1 = (s0 - s1) << 5; |
1133 out[18 + 9 + 4] = t0; | 1154 out[(9 + 4)*SBLIMIT] = -MULH(t1, win[9 + 4]) + buf[9 + 4]; |
1134 out[18 + 8 - 4] = t0; | 1155 out[(8 - 4)*SBLIMIT] = MULH(t1, win[8 - 4]) + buf[8 - 4]; |
1135 out[9 + 4] = -t1; | 1156 buf[9 + 4] = MULH(t0, win[18 + 9 + 4]); |
1136 out[8 - 4] = t1; | 1157 buf[8 - 4] = MULH(t0, win[18 + 8 - 4]); |
1137 } | 1158 } |
1138 | 1159 |
1139 /* header decoding. MUST check the header before because no | 1160 /* header decoding. MUST check the header before because no |
1140 consistency check is done there. Return 1 if free format found and | 1161 consistency check is done there. Return 1 if free format found and |
1141 that the frame size must be computed externally */ | 1162 that the frame size must be computed externally */ |
2062 } | 2083 } |
2063 | 2084 |
2064 buf = mdct_buf; | 2085 buf = mdct_buf; |
2065 ptr = g->sb_hybrid; | 2086 ptr = g->sb_hybrid; |
2066 for(j=0;j<mdct_long_end;j++) { | 2087 for(j=0;j<mdct_long_end;j++) { |
2067 imdct36(out, ptr); | |
2068 /* apply window & overlap with previous buffer */ | 2088 /* apply window & overlap with previous buffer */ |
2069 out_ptr = sb_samples + j; | 2089 out_ptr = sb_samples + j; |
2070 /* select window */ | 2090 /* select window */ |
2071 if (g->switch_point && j < 2) | 2091 if (g->switch_point && j < 2) |
2072 win1 = mdct_win[0]; | 2092 win1 = mdct_win[0]; |
2073 else | 2093 else |
2074 win1 = mdct_win[g->block_type]; | 2094 win1 = mdct_win[g->block_type]; |
2075 /* select frequency inversion */ | 2095 /* select frequency inversion */ |
2076 win = win1 + ((4 * 36) & -(j & 1)); | 2096 win = win1 + ((4 * 36) & -(j & 1)); |
2077 for(i=0;i<18;i++) { | 2097 imdct36(out_ptr, buf, ptr, win); |
2078 *out_ptr = MULL(out[i], win[i]) + buf[i]; | 2098 out_ptr += 18*SBLIMIT; |
2079 buf[i] = MULL(out[i + 18], win[i + 18]); | |
2080 out_ptr += SBLIMIT; | |
2081 } | |
2082 ptr += 18; | 2099 ptr += 18; |
2083 buf += 18; | 2100 buf += 18; |
2084 } | 2101 } |
2085 for(j=mdct_long_end;j<sblimit;j++) { | 2102 for(j=mdct_long_end;j<sblimit;j++) { |
2086 for(i=0;i<6;i++) { | 2103 for(i=0;i<6;i++) { |