comparison liba52/resample_c.c @ 3569:d14494d84c29

MMX opt
author michael
date Mon, 17 Dec 2001 21:19:57 +0000
parents 9e1e88b3ca18
children 8600f40003de
comparison
equal deleted inserted replaced
3568:454dfb7787d7 3569:d14494d84c29
4 4
5 // a52_resample_init should find the requested converter (from type flags -> 5 // a52_resample_init should find the requested converter (from type flags ->
6 // given number of channels) and set up some function pointers... 6 // given number of channels) and set up some function pointers...
7 7
8 // a52_resample() should do the conversion. 8 // a52_resample() should do the conversion.
9
10 // MMX optimizations from Michael Niedermayer (michaelni@gmx.at) (under GPL)
11
12 /* optimization TODO / NOTES
13 movntq is slightly faster (0.5% with the current test.c benchmark)
14 (but that's just test.c, so that needs to be tested in reality)
15 and it would mean (C / MMX2 / MMX / 3DNOW) versions
16 */
9 17
10 #include <inttypes.h> 18 #include <inttypes.h>
11 #include "a52.h" 19 #include "a52.h"
12 #include "../config.h" 20 #include "../config.h"
13 21
106 s16[5*i+2] = s16[5*i+3] = 0; 114 s16[5*i+2] = s16[5*i+3] = 0;
107 s16[5*i+4] = convert (f[i+256]); 115 s16[5*i+4] = convert (f[i+256]);
108 } 116 }
109 break; 117 break;
110 case A52_2F2R: 118 case A52_2F2R:
119 #ifdef HAVE_MMX
120 asm volatile(
121 "movl $-1024, %%esi \n\t"
122 "movq magicF2W, %%mm7 \n\t"
123 "1: \n\t"
124 "movq (%1, %%esi), %%mm0 \n\t"
125 "movq 8(%1, %%esi), %%mm1 \n\t"
126 "movq 1024(%1, %%esi), %%mm2 \n\t"
127 "movq 1032(%1, %%esi), %%mm3 \n\t"
128 "psubd %%mm7, %%mm0 \n\t"
129 "psubd %%mm7, %%mm1 \n\t"
130 "psubd %%mm7, %%mm2 \n\t"
131 "psubd %%mm7, %%mm3 \n\t"
132 "packssdw %%mm1, %%mm0 \n\t"
133 "packssdw %%mm3, %%mm2 \n\t"
134 "movq 2048(%1, %%esi), %%mm3 \n\t"
135 "movq 2056(%1, %%esi), %%mm4 \n\t"
136 "movq 3072(%1, %%esi), %%mm5 \n\t"
137 "movq 3080(%1, %%esi), %%mm6 \n\t"
138 "psubd %%mm7, %%mm3 \n\t"
139 "psubd %%mm7, %%mm4 \n\t"
140 "psubd %%mm7, %%mm5 \n\t"
141 "psubd %%mm7, %%mm6 \n\t"
142 "packssdw %%mm4, %%mm3 \n\t"
143 "packssdw %%mm6, %%mm5 \n\t"
144 "movq %%mm0, %%mm1 \n\t"
145 "movq %%mm3, %%mm4 \n\t"
146 "punpcklwd %%mm2, %%mm0 \n\t"
147 "punpckhwd %%mm2, %%mm1 \n\t"
148 "punpcklwd %%mm5, %%mm3 \n\t"
149 "punpckhwd %%mm5, %%mm4 \n\t"
150 "movq %%mm0, %%mm2 \n\t"
151 "movq %%mm1, %%mm5 \n\t"
152 "punpckldq %%mm3, %%mm0 \n\t"
153 "punpckhdq %%mm3, %%mm2 \n\t"
154 "punpckldq %%mm4, %%mm1 \n\t"
155 "punpckhdq %%mm4, %%mm5 \n\t"
156 "movq %%mm0, (%0, %%esi,2) \n\t"
157 "movq %%mm2, 8(%0, %%esi,2) \n\t"
158 "movq %%mm1, 16(%0, %%esi,2) \n\t"
159 "movq %%mm5, 24(%0, %%esi,2) \n\t"
160 "addl $16, %%esi \n\t"
161 " jnz 1b \n\t"
162 "emms \n\t"
163 :: "r" (s16+1024), "r" (f+256)
164 :"%esi", "memory"
165 );
166 #else
111 for (i = 0; i < 256; i++) { 167 for (i = 0; i < 256; i++) {
112 s16[4*i] = convert (f[i]); 168 s16[4*i] = convert (f[i]);
113 s16[4*i+1] = convert (f[i+256]); 169 s16[4*i+1] = convert (f[i+256]);
114 s16[4*i+2] = convert (f[i+512]); 170 s16[4*i+2] = convert (f[i+512]);
115 s16[4*i+3] = convert (f[i+768]); 171 s16[4*i+3] = convert (f[i+768]);
116 } 172 }
173 #endif
117 break; 174 break;
118 case A52_3F2R: 175 case A52_3F2R:
119 for (i = 0; i < 256; i++) { 176 for (i = 0; i < 256; i++) {
120 s16[5*i] = convert (f[i]); 177 s16[5*i] = convert (f[i]);
121 s16[5*i+1] = convert (f[i+512]); 178 s16[5*i+1] = convert (f[i+512]);
123 s16[5*i+3] = convert (f[i+1024]); 180 s16[5*i+3] = convert (f[i+1024]);
124 s16[5*i+4] = convert (f[i+256]); 181 s16[5*i+4] = convert (f[i+256]);
125 } 182 }
126 break; 183 break;
127 case A52_MONO | A52_LFE: 184 case A52_MONO | A52_LFE:
185 #ifdef HAVE_MMX
186 asm volatile(
187 "movl $-1024, %%esi \n\t"
188 "movq magicF2W, %%mm7 \n\t"
189 "pxor %%mm6, %%mm6 \n\t"
190 "1: \n\t"
191 "movq 1024(%1, %%esi), %%mm0 \n\t"
192 "movq 1032(%1, %%esi), %%mm1 \n\t"
193 "movq (%1, %%esi), %%mm2 \n\t"
194 "movq 8(%1, %%esi), %%mm3 \n\t"
195 "psubd %%mm7, %%mm0 \n\t"
196 "psubd %%mm7, %%mm1 \n\t"
197 "psubd %%mm7, %%mm2 \n\t"
198 "psubd %%mm7, %%mm3 \n\t"
199 "packssdw %%mm1, %%mm0 \n\t"
200 "packssdw %%mm3, %%mm2 \n\t"
201 "movq %%mm0, %%mm1 \n\t"
202 "punpcklwd %%mm2, %%mm0 \n\t"
203 "punpckhwd %%mm2, %%mm1 \n\t"
204 "leal (%%esi, %%esi, 2), %%edi \n\t"
205 "movq %%mm6, (%0, %%edi) \n\t"
206 "movd %%mm0, 8(%0, %%edi) \n\t"
207 "punpckhdq %%mm0, %%mm0 \n\t"
208 "movq %%mm6, 12(%0, %%edi) \n\t"
209 "movd %%mm0, 20(%0, %%edi) \n\t"
210 "movq %%mm6, 24(%0, %%edi) \n\t"
211 "movd %%mm1, 32(%0, %%edi) \n\t"
212 "punpckhdq %%mm1, %%mm1 \n\t"
213 "movq %%mm6, 36(%0, %%edi) \n\t"
214 "movd %%mm1, 44(%0, %%edi) \n\t"
215 "addl $16, %%esi \n\t"
216 " jnz 1b \n\t"
217 "emms \n\t"
218 :: "r" (s16+1536), "r" (f+256)
219 :"%esi", "%edi", "memory"
220 );
221 #else
128 for (i = 0; i < 256; i++) { 222 for (i = 0; i < 256; i++) {
129 s16[6*i] = s16[6*i+1] = s16[6*i+2] = s16[6*i+3] = 0; 223 s16[6*i] = s16[6*i+1] = s16[6*i+2] = s16[6*i+3] = 0;
130 s16[6*i+4] = convert (f[i+256]); 224 s16[6*i+4] = convert (f[i+256]);
131 s16[6*i+5] = convert (f[i]); 225 s16[6*i+5] = convert (f[i]);
132 } 226 }
227 #endif
133 break; 228 break;
134 case A52_CHANNEL | A52_LFE: 229 case A52_CHANNEL | A52_LFE:
135 case A52_STEREO | A52_LFE: 230 case A52_STEREO | A52_LFE:
136 case A52_DOLBY | A52_LFE: 231 case A52_DOLBY | A52_LFE:
137 for (i = 0; i < 256; i++) { 232 for (i = 0; i < 256; i++) {