comparison mpegaudiodec.c @ 2489:2e5ae040e92b libavcodec

optimizing imdct36()
author michael
date Tue, 01 Feb 2005 21:27:18 +0000
parents dfdb6bf4b90f
children c2c642ad6ef4
comparison
equal deleted inserted replaced
2488:8dd4672b9f74 2489:2e5ae040e92b
66 #define MUL64(a,b) ((int64_t)(a) * (int64_t)(b)) 66 #define MUL64(a,b) ((int64_t)(a) * (int64_t)(b))
67 #define FIX(a) ((int)((a) * FRAC_ONE)) 67 #define FIX(a) ((int)((a) * FRAC_ONE))
68 /* WARNING: only correct for posititive numbers */ 68 /* WARNING: only correct for posititive numbers */
69 #define FIXR(a) ((int)((a) * FRAC_ONE + 0.5)) 69 #define FIXR(a) ((int)((a) * FRAC_ONE + 0.5))
70 #define FRAC_RND(a) (((a) + (FRAC_ONE/2)) >> FRAC_BITS) 70 #define FRAC_RND(a) (((a) + (FRAC_ONE/2)) >> FRAC_BITS)
71
72 #define FIXHR(a) ((int)((a) * (1LL<<32) + 0.5))
73 //#define MULH(a,b) (((int64_t)(a) * (int64_t)(b))>>32) //gcc 3.4 creates an incredibly bloated mess out of this
74 static always_inline int MULH(int a, int b){
75 return ((int64_t)(a) * (int64_t)(b))>>32;
76 }
71 77
72 #if FRAC_BITS <= 15 78 #if FRAC_BITS <= 15
73 typedef int16_t MPA_INT; 79 typedef int16_t MPA_INT;
74 #else 80 #else
75 typedef int32_t MPA_INT; 81 typedef int32_t MPA_INT;
491 csa_table_float[i][0] = cs; 497 csa_table_float[i][0] = cs;
492 csa_table_float[i][1] = ca; 498 csa_table_float[i][1] = ca;
493 csa_table_float[i][2] = ca + cs; 499 csa_table_float[i][2] = ca + cs;
494 csa_table_float[i][3] = ca - cs; 500 csa_table_float[i][3] = ca - cs;
495 // printf("%d %d %d %d\n", FIX(cs), FIX(cs-1), FIX(ca), FIX(cs)-FIX(ca)); 501 // printf("%d %d %d %d\n", FIX(cs), FIX(cs-1), FIX(ca), FIX(cs)-FIX(ca));
502 // av_log(NULL, AV_LOG_DEBUG,"%f %f %f %f\n", cs, ca, ca+cs, ca-cs);
496 } 503 }
497 504
498 /* compute mdct windows */ 505 /* compute mdct windows */
499 for(i=0;i<36;i++) { 506 for(i=0;i<36;i++) {
500 int v; 507 for(j=0; j<4; j++){
501 v = FIXR(sin(M_PI * (i + 0.5) / 36.0)); 508 double d;
502 mdct_win[0][i] = v; 509 if(j==2) continue;
503 mdct_win[1][i] = v; 510
504 mdct_win[3][i] = v; 511 d= sin(M_PI * (i + 0.5) / 36.0);
505 } 512 if(j==1){
506 for(i=0;i<6;i++) { 513 if (i>=30) d= 0;
507 mdct_win[1][18 + i] = FIXR(1.0); 514 else if(i>=24) d= sin(M_PI * (i - 18 + 0.5) / 12.0);
508 mdct_win[1][24 + i] = FIXR(sin(M_PI * ((i + 6) + 0.5) / 12.0)); 515 else if(i>=18) d= 1;
509 mdct_win[1][30 + i] = FIXR(0.0); 516 }else if(j==3){
510 517 if (i< 6) d= 0;
511 mdct_win[3][i] = FIXR(0.0); 518 else if(i< 12) d= sin(M_PI * (i - 6 + 0.5) / 12.0);
512 mdct_win[3][6 + i] = FIXR(sin(M_PI * (i + 0.5) / 12.0)); 519 else if(i< 18) d= 1;
513 mdct_win[3][12 + i] = FIXR(1.0); 520 }
521 //merge last stage of imdct into the window coefficients
522 if (i/9 == 0) d*= 0.5 / cos(M_PI*(2*( i) +19)/72);
523 else if(i/9 == 1) d*= 0.5 / cos(M_PI*(2*(17 - i) +19)/72);
524 else if(i/9 == 2) d*= 0.5 / cos(M_PI*(2*( i) +19)/72);
525 else d*=-0.5 / cos(M_PI*(2*(17 - i) +19)/72);
526 mdct_win[j][i] = FIXHR((d / (1<<5)));
527 // av_log(NULL, AV_LOG_DEBUG, "%2d %d %f\n", i,j,d / (1<<5));
528 }
514 } 529 }
515 530
516 for(i=0;i<12;i++) 531 for(i=0;i<12;i++)
517 mdct_win[2][i] = FIXR(sin(M_PI * (i + 0.5) / 12.0)); 532 mdct_win[2][i] = FIXR(sin(M_PI * (i + 0.5) / 12.0));
518 533
1009 #undef C7 1024 #undef C7
1010 #undef C9 1025 #undef C9
1011 #undef C11 1026 #undef C11
1012 1027
1013 /* cos(pi*i/18) */ 1028 /* cos(pi*i/18) */
1014 #define C1 FIXR(0.98480775301220805936) 1029 #define C1 FIXHR(0.98480775301220805936/2)
1015 #define C2 FIXR(0.93969262078590838405) 1030 #define C2 FIXHR(0.93969262078590838405/2)
1016 #define C3 FIXR(0.86602540378443864676) 1031 #define C3 FIXHR(0.86602540378443864676/2)
1017 #define C4 FIXR(0.76604444311897803520) 1032 #define C4 FIXHR(0.76604444311897803520/2)
1018 #define C5 FIXR(0.64278760968653932632) 1033 #define C5 FIXHR(0.64278760968653932632/2)
1019 #define C6 FIXR(0.5) 1034 #define C6 FIXHR(0.5/2)
1020 #define C7 FIXR(0.34202014332566873304) 1035 #define C7 FIXHR(0.34202014332566873304/2)
1021 #define C8 FIXR(0.17364817766693034885) 1036 #define C8 FIXHR(0.17364817766693034885/2)
1037
1022 1038
1023 /* 0.5 / cos(pi*(2*i+1)/36) */ 1039 /* 0.5 / cos(pi*(2*i+1)/36) */
1024 static const int icos36[9] = { 1040 static const int icos36[9] = {
1025 FIXR(0.50190991877167369479), 1041 FIXR(0.50190991877167369479),
1026 FIXR(0.51763809020504152469), 1042 FIXR(0.51763809020504152469),
1030 FIXR(0.87172339781054900991), 1046 FIXR(0.87172339781054900991),
1031 FIXR(1.18310079157624925896), 1047 FIXR(1.18310079157624925896),
1032 FIXR(1.93185165257813657349), 1048 FIXR(1.93185165257813657349),
1033 FIXR(5.73685662283492756461), 1049 FIXR(5.73685662283492756461),
1034 }; 1050 };
1035
1036 static const int icos72[18] = {
1037 /* 0.5 / cos(pi*(2*i+19)/72) */
1038 FIXR(0.74009361646113053152),
1039 FIXR(0.82133981585229078570),
1040 FIXR(0.93057949835178895673),
1041 FIXR(1.08284028510010010928),
1042 FIXR(1.30656296487637652785),
1043 FIXR(1.66275476171152078719),
1044 FIXR(2.31011315767264929558),
1045 FIXR(3.83064878777019433457),
1046 FIXR(11.46279281302667383546),
1047
1048 /* 0.5 / cos(pi*(2*(i + 18) +19)/72) */
1049 FIXR(-0.67817085245462840086),
1050 FIXR(-0.63023620700513223342),
1051 FIXR(-0.59284452371708034528),
1052 FIXR(-0.56369097343317117734),
1053 FIXR(-0.54119610014619698439),
1054 FIXR(-0.52426456257040533932),
1055 FIXR(-0.51213975715725461845),
1056 FIXR(-0.50431448029007636036),
1057 FIXR(-0.50047634258165998492),
1058 };
1059
1060 /* using Lee like decomposition followed by hand coded 9 points DCT */ 1051 /* using Lee like decomposition followed by hand coded 9 points DCT */
1061 static void imdct36(int *out, int *in) 1052 static void imdct36(int *out, int *buf, int *in, int *win)
1062 { 1053 {
1063 int i, j, t0, t1, t2, t3, s0, s1, s2, s3; 1054 int i, j, t0, t1, t2, t3, s0, s1, s2, s3;
1064 int tmp[18], *tmp1, *in1; 1055 int tmp[18], *tmp1, *in1;
1065 int64_t in3_3, in6_6;
1066 1056
1067 for(i=17;i>=1;i--) 1057 for(i=17;i>=1;i--)
1068 in[i] += in[i-1]; 1058 in[i] += in[i-1];
1069 for(i=17;i>=3;i-=2) 1059 for(i=17;i>=3;i-=2)
1070 in[i] += in[i-2]; 1060 in[i] += in[i-2];
1071 1061
1072 for(j=0;j<2;j++) { 1062 for(j=0;j<2;j++) {
1073 tmp1 = tmp + j; 1063 tmp1 = tmp + j;
1074 in1 = in + j; 1064 in1 = in + j;
1075 1065 #if 0
1076 in3_3 = MUL64(in1[2*3], C3); 1066 //more accurate but slower
1077 in6_6 = MUL64(in1[2*6], C6); 1067 int64_t t0, t1, t2, t3;
1078 1068 t2 = in1[2*4] + in1[2*8] - in1[2*2];
1079 tmp1[0] = FRAC_RND(MUL64(in1[2*1], C1) + in3_3 + 1069
1080 MUL64(in1[2*5], C5) + MUL64(in1[2*7], C7)); 1070 t3 = (in1[2*0] + (int64_t)(in1[2*6]>>1))<<32;
1081 tmp1[2] = in1[2*0] + FRAC_RND(MUL64(in1[2*2], C2) + 1071 t1 = in1[2*0] - in1[2*6];
1082 MUL64(in1[2*4], C4) + in6_6 + 1072 tmp1[ 6] = t1 - (t2>>1);
1083 MUL64(in1[2*8], C8)); 1073 tmp1[16] = t1 + t2;
1084 tmp1[4] = FRAC_RND(MUL64(in1[2*1] - in1[2*5] - in1[2*7], C3)); 1074
1085 tmp1[6] = FRAC_RND(MUL64(in1[2*2] - in1[2*4] - in1[2*8], C6)) - 1075 t0 = MUL64(2*(in1[2*2] + in1[2*4]), C2);
1086 in1[2*6] + in1[2*0]; 1076 t1 = MUL64( in1[2*4] - in1[2*8] , -2*C8);
1087 tmp1[8] = FRAC_RND(MUL64(in1[2*1], C5) - in3_3 - 1077 t2 = MUL64(2*(in1[2*2] + in1[2*8]), -C4);
1088 MUL64(in1[2*5], C7) + MUL64(in1[2*7], C1)); 1078
1089 tmp1[10] = in1[2*0] + FRAC_RND(MUL64(-in1[2*2], C8) - 1079 tmp1[10] = (t3 - t0 - t2) >> 32;
1090 MUL64(in1[2*4], C2) + in6_6 + 1080 tmp1[ 2] = (t3 + t0 + t1) >> 32;
1091 MUL64(in1[2*8], C4)); 1081 tmp1[14] = (t3 + t2 - t1) >> 32;
1092 tmp1[12] = FRAC_RND(MUL64(in1[2*1], C7) - in3_3 + 1082
1093 MUL64(in1[2*5], C1) - 1083 tmp1[ 4] = MULH(2*(in1[2*5] + in1[2*7] - in1[2*1]), -C3);
1094 MUL64(in1[2*7], C5)); 1084 t2 = MUL64(2*(in1[2*1] + in1[2*5]), C1);
1095 tmp1[14] = in1[2*0] + FRAC_RND(MUL64(-in1[2*2], C4) + 1085 t3 = MUL64( in1[2*5] - in1[2*7] , -2*C7);
1096 MUL64(in1[2*4], C8) + in6_6 - 1086 t0 = MUL64(2*in1[2*3], C3);
1097 MUL64(in1[2*8], C2)); 1087
1098 tmp1[16] = in1[2*0] - in1[2*2] + in1[2*4] - in1[2*6] + in1[2*8]; 1088 t1 = MUL64(2*(in1[2*1] + in1[2*7]), -C5);
1089
1090 tmp1[ 0] = (t2 + t3 + t0) >> 32;
1091 tmp1[12] = (t2 + t1 - t0) >> 32;
1092 tmp1[ 8] = (t3 - t1 - t0) >> 32;
1093 #else
1094 t2 = in1[2*4] + in1[2*8] - in1[2*2];
1095
1096 t3 = in1[2*0] + (in1[2*6]>>1);
1097 t1 = in1[2*0] - in1[2*6];
1098 tmp1[ 6] = t1 - (t2>>1);
1099 tmp1[16] = t1 + t2;
1100
1101 t0 = MULH(2*(in1[2*2] + in1[2*4]), C2);
1102 t1 = MULH( in1[2*4] - in1[2*8] , -2*C8);
1103 t2 = MULH(2*(in1[2*2] + in1[2*8]), -C4);
1104
1105 tmp1[10] = t3 - t0 - t2;
1106 tmp1[ 2] = t3 + t0 + t1;
1107 tmp1[14] = t3 + t2 - t1;
1108
1109 tmp1[ 4] = MULH(2*(in1[2*5] + in1[2*7] - in1[2*1]), -C3);
1110 t2 = MULH(2*(in1[2*1] + in1[2*5]), C1);
1111 t3 = MULH( in1[2*5] - in1[2*7] , -2*C7);
1112 t0 = MULH(2*in1[2*3], C3);
1113
1114 t1 = MULH(2*(in1[2*1] + in1[2*7]), -C5);
1115
1116 tmp1[ 0] = t2 + t3 + t0;
1117 tmp1[12] = t2 + t1 - t0;
1118 tmp1[ 8] = t3 - t1 - t0;
1119 #endif
1099 } 1120 }
1100 1121
1101 i = 0; 1122 i = 0;
1102 for(j=0;j<4;j++) { 1123 for(j=0;j<4;j++) {
1103 t0 = tmp[i]; 1124 t0 = tmp[i];
1108 t2 = tmp[i + 1]; 1129 t2 = tmp[i + 1];
1109 t3 = tmp[i + 3]; 1130 t3 = tmp[i + 3];
1110 s1 = MULL(t3 + t2, icos36[j]); 1131 s1 = MULL(t3 + t2, icos36[j]);
1111 s3 = MULL(t3 - t2, icos36[8 - j]); 1132 s3 = MULL(t3 - t2, icos36[8 - j]);
1112 1133
1113 t0 = MULL(s0 + s1, icos72[9 + 8 - j]); 1134 t0 = (s0 + s1) << 5;
1114 t1 = MULL(s0 - s1, icos72[8 - j]); 1135 t1 = (s0 - s1) << 5;
1115 out[18 + 9 + j] = t0; 1136 out[(9 + j)*SBLIMIT] = -MULH(t1, win[9 + j]) + buf[9 + j];
1116 out[18 + 8 - j] = t0; 1137 out[(8 - j)*SBLIMIT] = MULH(t1, win[8 - j]) + buf[8 - j];
1117 out[9 + j] = -t1; 1138 buf[9 + j] = MULH(t0, win[18 + 9 + j]);
1118 out[8 - j] = t1; 1139 buf[8 - j] = MULH(t0, win[18 + 8 - j]);
1119 1140
1120 t0 = MULL(s2 + s3, icos72[9+j]); 1141 t0 = (s2 + s3) << 5;
1121 t1 = MULL(s2 - s3, icos72[j]); 1142 t1 = (s2 - s3) << 5;
1122 out[18 + 9 + (8 - j)] = t0; 1143 out[(9 + 8 - j)*SBLIMIT] = -MULH(t1, win[9 + 8 - j]) + buf[9 + 8 - j];
1123 out[18 + j] = t0; 1144 out[( j)*SBLIMIT] = MULH(t1, win[ j]) + buf[ j];
1124 out[9 + (8 - j)] = -t1; 1145 buf[9 + 8 - j] = MULH(t0, win[18 + 9 + 8 - j]);
1125 out[j] = t1; 1146 buf[ + j] = MULH(t0, win[18 + j]);
1126 i += 4; 1147 i += 4;
1127 } 1148 }
1128 1149
1129 s0 = tmp[16]; 1150 s0 = tmp[16];
1130 s1 = MULL(tmp[17], icos36[4]); 1151 s1 = MULL(tmp[17], icos36[4]);
1131 t0 = MULL(s0 + s1, icos72[9 + 4]); 1152 t0 = (s0 + s1) << 5;
1132 t1 = MULL(s0 - s1, icos72[4]); 1153 t1 = (s0 - s1) << 5;
1133 out[18 + 9 + 4] = t0; 1154 out[(9 + 4)*SBLIMIT] = -MULH(t1, win[9 + 4]) + buf[9 + 4];
1134 out[18 + 8 - 4] = t0; 1155 out[(8 - 4)*SBLIMIT] = MULH(t1, win[8 - 4]) + buf[8 - 4];
1135 out[9 + 4] = -t1; 1156 buf[9 + 4] = MULH(t0, win[18 + 9 + 4]);
1136 out[8 - 4] = t1; 1157 buf[8 - 4] = MULH(t0, win[18 + 8 - 4]);
1137 } 1158 }
1138 1159
1139 /* header decoding. MUST check the header before because no 1160 /* header decoding. MUST check the header before because no
1140 consistency check is done there. Return 1 if free format found and 1161 consistency check is done there. Return 1 if free format found and
1141 that the frame size must be computed externally */ 1162 that the frame size must be computed externally */
2062 } 2083 }
2063 2084
2064 buf = mdct_buf; 2085 buf = mdct_buf;
2065 ptr = g->sb_hybrid; 2086 ptr = g->sb_hybrid;
2066 for(j=0;j<mdct_long_end;j++) { 2087 for(j=0;j<mdct_long_end;j++) {
2067 imdct36(out, ptr);
2068 /* apply window & overlap with previous buffer */ 2088 /* apply window & overlap with previous buffer */
2069 out_ptr = sb_samples + j; 2089 out_ptr = sb_samples + j;
2070 /* select window */ 2090 /* select window */
2071 if (g->switch_point && j < 2) 2091 if (g->switch_point && j < 2)
2072 win1 = mdct_win[0]; 2092 win1 = mdct_win[0];
2073 else 2093 else
2074 win1 = mdct_win[g->block_type]; 2094 win1 = mdct_win[g->block_type];
2075 /* select frequency inversion */ 2095 /* select frequency inversion */
2076 win = win1 + ((4 * 36) & -(j & 1)); 2096 win = win1 + ((4 * 36) & -(j & 1));
2077 for(i=0;i<18;i++) { 2097 imdct36(out_ptr, buf, ptr, win);
2078 *out_ptr = MULL(out[i], win[i]) + buf[i]; 2098 out_ptr += 18*SBLIMIT;
2079 buf[i] = MULL(out[i + 18], win[i + 18]);
2080 out_ptr += SBLIMIT;
2081 }
2082 ptr += 18; 2099 ptr += 18;
2083 buf += 18; 2100 buf += 18;
2084 } 2101 }
2085 for(j=mdct_long_end;j<sblimit;j++) { 2102 for(j=mdct_long_end;j<sblimit;j++) {
2086 for(i=0;i<6;i++) { 2103 for(i=0;i<6;i++) {