Mercurial > libavcodec.hg
annotate x86/fdct_mmx.c @ 12483:0159a19bfff7 libavcodec
aacdec: Rework channel mapping compatibility hacks.
For a PCE based configuration map the channels solely based on tags.
For an indexed configuration map the channels solely based on position.
This works with all known exotic samples including al17, elem_id0, bad_concat,
and lfe_is_sce.
author | alexc |
---|---|
date | Fri, 10 Sep 2010 18:01:48 +0000 |
parents | 7be32921237f |
children |
rev | line source |
---|---|
8430 | 1 /* |
2 * MMX optimized forward DCT | |
3 * The gcc porting is Copyright (c) 2001 Fabrice Bellard. | |
4 * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | |
5 * SSE2 optimization is Copyright (c) 2004 Denes Balatoni. | |
6 * | |
7 * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT | |
8 * | |
9 * Intel Application Note AP-922 - fast, precise implementation of DCT | |
10 * http://developer.intel.com/vtune/cbts/appnotes.htm | |
11 * | |
12 * Also of inspiration: | |
13 * a page about fdct at http://www.geocities.com/ssavekar/dct.htm | |
14 * Skal's fdct at http://skal.planet-d.net/coding/dct.html | |
15 * | |
16 * This file is part of FFmpeg. | |
17 * | |
18 * FFmpeg is free software; you can redistribute it and/or | |
19 * modify it under the terms of the GNU Lesser General Public | |
20 * License as published by the Free Software Foundation; either | |
21 * version 2.1 of the License, or (at your option) any later version. | |
22 * | |
23 * FFmpeg is distributed in the hope that it will be useful, | |
24 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
26 * Lesser General Public License for more details. | |
27 * | |
28 * You should have received a copy of the GNU Lesser General Public | |
29 * License along with FFmpeg; if not, write to the Free Software | |
30 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
31 */ | |
32 | |
33 #include "libavutil/common.h" | |
34 #include "libavcodec/dsputil.h" | |
35 | |
36 ////////////////////////////////////////////////////////////////////// | |
37 // | |
38 // constants for the forward DCT | |
39 // ----------------------------- | |
40 // | |
41 // Be sure to check that your compiler is aligning all constants to QWORD | |
42 // (8-byte) memory boundaries! Otherwise the unaligned memory access will | |
43 // severely stall MMX execution. | |
44 // | |
45 ////////////////////////////////////////////////////////////////////// | |
46 | |
/* Fixed-point configuration shared by the column and row passes below. */
// Number of extra accuracy bits; the original note allows 2 or 3.
47 #define BITS_FRW_ACC 3 //; 2 or 3 for accuracy | |
// Left shift applied to the inputs of the column pass (same value as BITS_FRW_ACC).
48 #define SHIFT_FRW_COL BITS_FRW_ACC | |
// Right shift applied to the 32-bit sums of the row pass (3 + 17 - 3 = 17 with the value above).
49 #define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) | |
// Rounding bias added before the row shift: half of (1 << SHIFT_FRW_ROW).
50 #define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) | |
51 //#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) | |
52 | |
// Expands to its argument repeated eight times, comma separated; used to fill 8-lane constant vectors.
53 #define X8(x) x,x,x,x,x,x,x,x | |
54 | |
55 //concatenated table, for forward DCT transformation | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
/*
 * Tangent constants for the column pass, 16-byte aligned.
 * Three values, each replicated in 8 int16_t lanes; FDCT_COL reads them with
 * pmulhw/mov at byte offsets 0, 16 and 32 of this array.
 * The third value does not fit a signed 16-bit lane at this scale, so it is
 * stored minus (1<<16): 43790 - 65536 = -21746.
 */
56 DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = { |
8430 | 57 X8(13036), // tan(pi/16) * (1<<16) + 0.5 |
58 X8(27146), // tan(2*pi/16) * (1<<16) + 0.5 | |
59 X8(-21746) // (tan(3*pi/16) - 1) * (1<<16) + 0.5 | |
60 }; | |
61 | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
/*
 * cos(pi/4) scaled by (1<<15), replicated in 8 int16_t lanes and 16-byte aligned.
 * FDCT_COL multiplies by it with "pmulhw (%4)".
 */
62 DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = { |
8430 | 63 X8(23170) // cos(pi/4) * (1<<15) + 0.5 |
64 }; | |
65 | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
/*
 * Eight int16_t lanes all equal to 1. FDCT_COL ORs this vector ("por (%2)")
 * into some of its intermediate results, which forces bit 0 of each lane to 1.
 */
66 DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) }; |
8430 | 67 |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
/*
 * Rounding bias for the MMX row passes: two 32-bit copies of RND_FRW_ROW.
 * fdct_row_mmx and fdct_row_mmx2 add it to every 32-bit sum before the
 * arithmetic right shift by SHIFT_FRW_ROW.
 */
68 DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW }; |
8430 | 69 |
/*
 * Rounding bias for the SSE2 row pass: four 32-bit copies of RND_FRW_ROW,
 * 16-byte aligned. fdct_row_sse2 loads it once into xmm6 with movdqa.
 * The array is the only member of an anonymous struct, so users refer to it
 * as fdct_r_row_sse2.fdct_r_row_sse2.
 */
70 static struct | |
71 { | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
72 DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4]; |
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
73 } fdct_r_row_sse2 = |
8430 | 74 {{ |
75 RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW | |
76 }}; | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
77 //DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; |
8430 | 78 |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
/*
 * Row-pass coefficient table used by fdct_row_mmx and fdct_row_mmx2.
 * It holds eight groups of 32 int16_t values, one group per row:
 * ff_fdct_mmx and ff_fdct_mmx2 start at the first group and advance the
 * table pointer by 32 entries after each row.
 * Groups 0 and 4, 1 and 7, 2 and 6, and 3 and 5 contain identical values.
 */
79 DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table |
8430 | 80 16384, 16384, 22725, 19266, |
81 16384, 16384, 12873, 4520, | |
82 21407, 8867, 19266, -4520, | |
83 -8867, -21407, -22725, -12873, | |
84 16384, -16384, 12873, -22725, | |
85 -16384, 16384, 4520, 19266, | |
86 8867, -21407, 4520, -12873, | |
87 21407, -8867, 19266, -22725, | |
88 | |
89 22725, 22725, 31521, 26722, | |
90 22725, 22725, 17855, 6270, | |
91 29692, 12299, 26722, -6270, | |
92 -12299, -29692, -31521, -17855, | |
93 22725, -22725, 17855, -31521, | |
94 -22725, 22725, 6270, 26722, | |
95 12299, -29692, 6270, -17855, | |
96 29692, -12299, 26722, -31521, | |
97 | |
98 21407, 21407, 29692, 25172, | |
99 21407, 21407, 16819, 5906, | |
100 27969, 11585, 25172, -5906, | |
101 -11585, -27969, -29692, -16819, | |
102 21407, -21407, 16819, -29692, | |
103 -21407, 21407, 5906, 25172, | |
104 11585, -27969, 5906, -16819, | |
105 27969, -11585, 25172, -29692, | |
106 | |
107 19266, 19266, 26722, 22654, | |
108 19266, 19266, 15137, 5315, | |
109 25172, 10426, 22654, -5315, | |
110 -10426, -25172, -26722, -15137, | |
111 19266, -19266, 15137, -26722, | |
112 -19266, 19266, 5315, 22654, | |
113 10426, -25172, 5315, -15137, | |
114 25172, -10426, 22654, -26722, | |
115 | |
116 16384, 16384, 22725, 19266, | |
117 16384, 16384, 12873, 4520, | |
118 21407, 8867, 19266, -4520, | |
119 -8867, -21407, -22725, -12873, | |
120 16384, -16384, 12873, -22725, | |
121 -16384, 16384, 4520, 19266, | |
122 8867, -21407, 4520, -12873, | |
123 21407, -8867, 19266, -22725, | |
124 | |
125 19266, 19266, 26722, 22654, | |
126 19266, 19266, 15137, 5315, | |
127 25172, 10426, 22654, -5315, | |
128 -10426, -25172, -26722, -15137, | |
129 19266, -19266, 15137, -26722, | |
130 -19266, 19266, 5315, 22654, | |
131 10426, -25172, 5315, -15137, | |
132 25172, -10426, 22654, -26722, | |
133 | |
134 21407, 21407, 29692, 25172, | |
135 21407, 21407, 16819, 5906, | |
136 27969, 11585, 25172, -5906, | |
137 -11585, -27969, -29692, -16819, | |
138 21407, -21407, 16819, -29692, | |
139 -21407, 21407, 5906, 25172, | |
140 11585, -27969, 5906, -16819, | |
141 27969, -11585, 25172, -29692, | |
142 | |
143 22725, 22725, 31521, 26722, | |
144 22725, 22725, 17855, 6270, | |
145 29692, 12299, 26722, -6270, | |
146 -12299, -29692, -31521, -17855, | |
147 22725, -22725, 17855, -31521, | |
148 -22725, 22725, 6270, 26722, | |
149 12299, -29692, 6270, -17855, | |
150 29692, -12299, 26722, -31521, | |
151 }; | |
152 | |
/*
 * Row-pass coefficient table for fdct_row_sse2, 16-byte aligned.
 * The initializer is TABLE_SSE2 (32 entries) expanded eight times, with the
 * macros C1..C7 redefined before each expansion, giving 8 * 32 = 256 int16_t.
 * fdct_row_sse2 addresses the table at base byte offsets 0, 64, 128 and 192
 * (plus 0..48), i.e. only the first four expansions are read by the code
 * visible in this file.
 * The array is the only member of an anonymous struct, so users refer to it
 * as tab_frw_01234567_sse2.tab_frw_01234567_sse2.
 */
153 static struct | |
154 { | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
155 DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256]; |
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
156 } tab_frw_01234567_sse2 = |
8430 | 157 {{ |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
158 //DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table |
8430 | 159 #define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ |
160 C4, C4, C5, C7, C2, C6, C3, -C7, \ | |
161 -C4, C4, C7, C3, C6, -C2, C7, -C5, \ | |
162 C4, -C4, C5, -C1, C2, -C6, C3, -C1, | |
163 // c1..c7 * cos(pi/4) * 2^15 | |
164 #define C1 22725 | |
165 #define C2 21407 | |
166 #define C3 19266 | |
167 #define C4 16384 | |
168 #define C5 12873 | |
169 #define C6 8867 | |
170 #define C7 4520 | |
171 TABLE_SSE2 | |
172 | |
173 #undef C1 | |
174 #undef C2 | |
175 #undef C3 | |
176 #undef C4 | |
177 #undef C5 | |
178 #undef C6 | |
179 #undef C7 | |
180 #define C1 31521 | |
181 #define C2 29692 | |
182 #define C3 26722 | |
183 #define C4 22725 | |
184 #define C5 17855 | |
185 #define C6 12299 | |
186 #define C7 6270 | |
187 TABLE_SSE2 | |
188 | |
189 #undef C1 | |
190 #undef C2 | |
191 #undef C3 | |
192 #undef C4 | |
193 #undef C5 | |
194 #undef C6 | |
195 #undef C7 | |
196 #define C1 29692 | |
197 #define C2 27969 | |
198 #define C3 25172 | |
199 #define C4 21407 | |
200 #define C5 16819 | |
201 #define C6 11585 | |
202 #define C7 5906 | |
203 TABLE_SSE2 | |
204 | |
205 #undef C1 | |
206 #undef C2 | |
207 #undef C3 | |
208 #undef C4 | |
209 #undef C5 | |
210 #undef C6 | |
211 #undef C7 | |
212 #define C1 26722 | |
213 #define C2 25172 | |
214 #define C3 22654 | |
215 #define C4 19266 | |
216 #define C5 15137 | |
217 #define C6 10426 | |
218 #define C7 5315 | |
219 TABLE_SSE2 | |
220 | |
221 #undef C1 | |
222 #undef C2 | |
223 #undef C3 | |
224 #undef C4 | |
225 #undef C5 | |
226 #undef C6 | |
227 #undef C7 | |
228 #define C1 22725 | |
229 #define C2 21407 | |
230 #define C3 19266 | |
231 #define C4 16384 | |
232 #define C5 12873 | |
233 #define C6 8867 | |
234 #define C7 4520 | |
235 TABLE_SSE2 | |
236 | |
237 #undef C1 | |
238 #undef C2 | |
239 #undef C3 | |
240 #undef C4 | |
241 #undef C5 | |
242 #undef C6 | |
243 #undef C7 | |
244 #define C1 26722 | |
245 #define C2 25172 | |
246 #define C3 22654 | |
247 #define C4 19266 | |
248 #define C5 15137 | |
249 #define C6 10426 | |
250 #define C7 5315 | |
251 TABLE_SSE2 | |
252 | |
253 #undef C1 | |
254 #undef C2 | |
255 #undef C3 | |
256 #undef C4 | |
257 #undef C5 | |
258 #undef C6 | |
259 #undef C7 | |
260 #define C1 29692 | |
261 #define C2 27969 | |
262 #define C3 25172 | |
263 #define C4 21407 | |
264 #define C5 16819 | |
265 #define C6 11585 | |
266 #define C7 5906 | |
267 TABLE_SSE2 | |
268 | |
269 #undef C1 | |
270 #undef C2 | |
271 #undef C3 | |
272 #undef C4 | |
273 #undef C5 | |
274 #undef C6 | |
275 #undef C7 | |
276 #define C1 31521 | |
277 #define C2 29692 | |
278 #define C3 26722 | |
279 #define C4 22725 | |
280 #define C5 17855 | |
281 #define C6 12299 | |
282 #define C7 6270 | |
283 TABLE_SSE2 | |
284 }}; | |
285 | |
// Short alias for AV_TOSTRING; used below to paste SHIFT_FRW_COL / SHIFT_FRW_ROW into asm strings as immediates.
286 #define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long | |
287 | |
/*
 * FDCT_COL(cpu, mm, mov) defines
 *     static av_always_inline void fdct_col_<cpu>(const int16_t *in, int16_t *out, int offset)
 * which performs the column pass of the forward DCT in one asm statement.
 *
 *   cpu - suffix of the generated function name
 *   mm  - register name prefix; registers <mm>0 .. <mm>7 are used
 *   mov - full-register move/load/store mnemonic for that register family
 *
 * Asm operands:
 *   %0 = in + offset     eight input vectors are read at byte offsets 0, 16, ..., 112
 *   %1 = fdct_tg_all_16  tangent constants at byte offsets 0, 16 and 32
 *   %2 = fdct_one_corr   OR-ed into some results with por
 *   %3 = out + offset    eight result vectors are stored at byte offsets 0, 16, ..., 112
 *   %4 = ocos_4_16       cos(pi/4) constant
 *
 * Inputs are shifted left by SHIFT_FRW_COL (one extra bit for two of the
 * differences) and all additions/subtractions are saturating (paddsw/psubsw).
 * The instruction order interleaves independent work; do not reorder.
 *
 * NOTE(review): the asm statement declares no outputs and no clobbers even
 * though it stores through %3 and overwrites registers <mm>0..<mm>7 - confirm
 * this is intentional before changing surrounding code.
 */
288 #define FDCT_COL(cpu, mm, mov)\ | |
289 static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\ | |
290 {\ | |
291 __asm__ volatile (\ | |
292 #mov" 16(%0), %%"#mm"0 \n\t" \ | |
293 #mov" 96(%0), %%"#mm"1 \n\t" \ | |
294 #mov" %%"#mm"0, %%"#mm"2 \n\t" \ | |
295 #mov" 32(%0), %%"#mm"3 \n\t" \ | |
296 "paddsw %%"#mm"1, %%"#mm"0 \n\t" \ | |
297 #mov" 80(%0), %%"#mm"4 \n\t" \ | |
298 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \ | |
299 #mov" (%0), %%"#mm"5 \n\t" \ | |
300 "paddsw %%"#mm"3, %%"#mm"4 \n\t" \ | |
301 "paddsw 112(%0), %%"#mm"5 \n\t" \ | |
302 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \ | |
303 #mov" %%"#mm"0, %%"#mm"6 \n\t" \ | |
304 "psubsw %%"#mm"1, %%"#mm"2 \n\t" \ | |
305 #mov" 16(%1), %%"#mm"1 \n\t" \ | |
306 "psubsw %%"#mm"4, %%"#mm"0 \n\t" \ | |
307 #mov" 48(%0), %%"#mm"7 \n\t" \ | |
308 "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \ | |
309 "paddsw 64(%0), %%"#mm"7 \n\t" \ | |
310 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \ | |
311 "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ | |
312 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \ | |
313 #mov" %%"#mm"5, %%"#mm"4 \n\t" \ | |
314 "psubsw %%"#mm"7, %%"#mm"5 \n\t" \ | |
315 "paddsw %%"#mm"5, %%"#mm"1 \n\t" \ | |
316 "paddsw %%"#mm"7, %%"#mm"4 \n\t" \ | |
317 "por (%2), %%"#mm"1 \n\t" \ | |
318 "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \ | |
319 "pmulhw 16(%1), %%"#mm"5 \n\t" \ | |
320 #mov" %%"#mm"4, %%"#mm"7 \n\t" \ | |
321 "psubsw 80(%0), %%"#mm"3 \n\t" \ | |
322 "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ | |
323 #mov" %%"#mm"1, 32(%3) \n\t" \ | |
324 "paddsw %%"#mm"6, %%"#mm"7 \n\t" \ | |
325 #mov" 48(%0), %%"#mm"1 \n\t" \ | |
326 "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \ | |
327 "psubsw 64(%0), %%"#mm"1 \n\t" \ | |
328 #mov" %%"#mm"2, %%"#mm"6 \n\t" \ | |
329 #mov" %%"#mm"4, 64(%3) \n\t" \ | |
330 "paddsw %%"#mm"3, %%"#mm"2 \n\t" \ | |
331 "pmulhw (%4), %%"#mm"2 \n\t" \ | |
332 "psubsw %%"#mm"3, %%"#mm"6 \n\t" \ | |
333 "pmulhw (%4), %%"#mm"6 \n\t" \ | |
334 "psubsw %%"#mm"0, %%"#mm"5 \n\t" \ | |
335 "por (%2), %%"#mm"5 \n\t" \ | |
336 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \ | |
337 "por (%2), %%"#mm"2 \n\t" \ | |
338 #mov" %%"#mm"1, %%"#mm"4 \n\t" \ | |
339 #mov" (%0), %%"#mm"3 \n\t" \ | |
340 "paddsw %%"#mm"6, %%"#mm"1 \n\t" \ | |
341 "psubsw 112(%0), %%"#mm"3 \n\t" \ | |
342 "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ | |
343 #mov" (%1), %%"#mm"0 \n\t" \ | |
344 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \ | |
345 #mov" 32(%1), %%"#mm"6 \n\t" \ | |
346 "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \ | |
347 #mov" %%"#mm"7, (%3) \n\t" \ | |
348 "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \ | |
349 #mov" %%"#mm"5, 96(%3) \n\t" \ | |
350 #mov" %%"#mm"3, %%"#mm"7 \n\t" \ | |
351 #mov" 32(%1), %%"#mm"5 \n\t" \ | |
352 "psubsw %%"#mm"2, %%"#mm"7 \n\t" \ | |
353 "paddsw %%"#mm"2, %%"#mm"3 \n\t" \ | |
354 "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \ | |
355 "paddsw %%"#mm"3, %%"#mm"0 \n\t" \ | |
356 "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ | |
357 "pmulhw (%1), %%"#mm"3 \n\t" \ | |
358 "por (%2), %%"#mm"0 \n\t" \ | |
359 "paddsw %%"#mm"7, %%"#mm"5 \n\t" \ | |
360 "psubsw %%"#mm"6, %%"#mm"7 \n\t" \ | |
361 #mov" %%"#mm"0, 16(%3) \n\t" \ | |
362 "paddsw %%"#mm"4, %%"#mm"5 \n\t" \ | |
363 #mov" %%"#mm"7, 48(%3) \n\t" \ | |
364 "psubsw %%"#mm"1, %%"#mm"3 \n\t" \ | |
365 #mov" %%"#mm"5, 80(%3) \n\t" \ | |
366 #mov" %%"#mm"3, 112(%3) \n\t" \ | |
367 : \ | |
368 : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \ | |
369 "r" (out + offset), "r" (ocos_4_16)); \ | |
370 } | |
371 | |
/* fdct_col_mmx: 64-bit mm registers with movq; its callers below run it twice (offset 0 and 4). */
372 FDCT_COL(mmx, mm, movq) | |
/* fdct_col_sse2: 128-bit xmm registers with movdqa; its caller below runs it once (offset 0). */
373 FDCT_COL(sse2, xmm, movdqa) | |
374 | |
/*
 * SSE2 row pass: transforms all eight rows in a single asm statement.
 *
 *   in  - source rows; row k is read at byte offset 16*k
 *   out - destination rows; row k is stored with movdqa at byte offset 16*k
 *
 * Asm operands:
 *   %0 = in
 *   %1 = tab_frw_01234567_sse2 coefficient table
 *   %2 = fdct_r_row_sse2 rounding bias (kept in xmm6 for the whole pass)
 *   %3 = SHIFT_FRW_ROW as an immediate
 *   %4 = out
 *
 * NOTE(review): the asm statement declares no outputs and no clobbers even
 * though it stores through %4 and overwrites xmm0..xmm7 - confirm this is
 * intentional before changing surrounding code.
 */
375 static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) | |
376 { | |
377 __asm__ volatile( | |
/* H1(i,t): load the two 4-word halves of the row at byte offset i into
 * xmm2/xmm0 and four 16-byte coefficient vectors at table byte offset t
 * into xmm3, xmm7, xmm4 and xmm5. */
378 #define FDCT_ROW_SSE2_H1(i,t) \ | |
379 "movq " #i "(%0), %%xmm2 \n\t" \ | |
380 "movq " #i "+8(%0), %%xmm0 \n\t" \ | |
381 "movdqa " #t "+32(%1), %%xmm3 \n\t" \ | |
382 "movdqa " #t "+48(%1), %%xmm7 \n\t" \ | |
383 "movdqa " #t "(%1), %%xmm4 \n\t" \ | |
384 "movdqa " #t "+16(%1), %%xmm5 \n\t" | |
385 | |
/* H2(i,t): like H1 but reloads only xmm3 and xmm7, which FDCT_ROW_SSE2
 * overwrites; xmm4 and xmm5 are only read there, so the values loaded by
 * the preceding H1 are reused. */
386 #define FDCT_ROW_SSE2_H2(i,t) \ | |
387 "movq " #i "(%0), %%xmm2 \n\t" \ | |
388 "movq " #i "+8(%0), %%xmm0 \n\t" \ | |
389 "movdqa " #t "+32(%1), %%xmm3 \n\t" \ | |
390 "movdqa " #t "+48(%1), %%xmm7 \n\t" | |
391 | |
/* FDCT_ROW_SSE2(i): reverse the second half (pshuflw $27), form saturating
 * sum and difference, multiply-accumulate with the coefficients (pmaddwd),
 * add the rounding bias in xmm6, shift right by %3, pack to int16 with
 * saturation and store 16 bytes at byte offset i of out. */
392 #define FDCT_ROW_SSE2(i) \ | |
393 "movq %%xmm2, %%xmm1 \n\t" \ | |
394 "pshuflw $27, %%xmm0, %%xmm0 \n\t" \ | |
395 "paddsw %%xmm0, %%xmm1 \n\t" \ | |
396 "psubsw %%xmm0, %%xmm2 \n\t" \ | |
397 "punpckldq %%xmm2, %%xmm1 \n\t" \ | |
398 "pshufd $78, %%xmm1, %%xmm2 \n\t" \ | |
399 "pmaddwd %%xmm2, %%xmm3 \n\t" \ | |
400 "pmaddwd %%xmm1, %%xmm7 \n\t" \ | |
401 "pmaddwd %%xmm5, %%xmm2 \n\t" \ | |
402 "pmaddwd %%xmm4, %%xmm1 \n\t" \ | |
403 "paddd %%xmm7, %%xmm3 \n\t" \ | |
404 "paddd %%xmm2, %%xmm1 \n\t" \ | |
405 "paddd %%xmm6, %%xmm3 \n\t" \ | |
406 "paddd %%xmm6, %%xmm1 \n\t" \ | |
407 "psrad %3, %%xmm3 \n\t" \ | |
408 "psrad %3, %%xmm1 \n\t" \ | |
409 "packssdw %%xmm3, %%xmm1 \n\t" \ | |
410 "movdqa %%xmm1, " #i "(%4) \n\t" | |
411 | |
/* Rows are handled in pairs that share one coefficient group:
 * rows 0 and 4 (table offset 0), 1 and 7 (64), 2 and 6 (128), 3 and 5 (192). */
412 "movdqa (%2), %%xmm6 \n\t" | |
413 FDCT_ROW_SSE2_H1(0,0) | |
414 FDCT_ROW_SSE2(0) | |
415 FDCT_ROW_SSE2_H2(64,0) | |
416 FDCT_ROW_SSE2(64) | |
417 | |
418 FDCT_ROW_SSE2_H1(16,64) | |
419 FDCT_ROW_SSE2(16) | |
420 FDCT_ROW_SSE2_H2(112,64) | |
421 FDCT_ROW_SSE2(112) | |
422 | |
423 FDCT_ROW_SSE2_H1(32,128) | |
424 FDCT_ROW_SSE2(32) | |
425 FDCT_ROW_SSE2_H2(96,128) | |
426 FDCT_ROW_SSE2(96) | |
427 | |
428 FDCT_ROW_SSE2_H1(48,192) | |
429 FDCT_ROW_SSE2(48) | |
430 FDCT_ROW_SSE2_H2(80,192) | |
431 FDCT_ROW_SSE2(80) | |
432 : | |
433 : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) | |
434 ); | |
435 } | |
436 | |
/*
 * Row pass for one row using MMX plus pshufw.
 *
 *   in    - the row's 8 int16_t inputs (16 bytes are read)
 *   out   - receives the row's 8 int16_t results (16 bytes are written)
 *   table - 32 int16_t coefficients for this row (64 bytes are read)
 *
 * Asm operands: %0 = in, %1 = table, %2 = fdct_r_row (rounding bias), %3 = out.
 *
 * NOTE(review): the asm statement declares no outputs and no clobbers even
 * though it stores through %3 and overwrites mm0..mm7 - confirm this is
 * intentional before changing surrounding code.
 */
437 static av_always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table) | |
438 { | |
439 __asm__ volatile ( | |
// mm5 = second half of the row with its four words in reverse order
440 "pshufw $0x1B, 8(%0), %%mm5 \n\t" | |
441 "movq (%0), %%mm0 \n\t" | |
442 "movq %%mm0, %%mm1 \n\t" | |
// saturating sum (mm0) and difference (mm1) of first half and reversed second half
443 "paddsw %%mm5, %%mm0 \n\t" | |
444 "psubsw %%mm5, %%mm1 \n\t" | |
445 "movq %%mm0, %%mm2 \n\t" | |
// interleave sums and differences as dwords: low pairs in mm0, high pairs in mm2
446 "punpckldq %%mm1, %%mm0 \n\t" | |
447 "punpckhdq %%mm1, %%mm2 \n\t" | |
// multiply-accumulate against the eight 8-byte coefficient vectors of the table
448 "movq (%1), %%mm1 \n\t" | |
449 "movq 8(%1), %%mm3 \n\t" | |
450 "movq 16(%1), %%mm4 \n\t" | |
451 "movq 24(%1), %%mm5 \n\t" | |
452 "movq 32(%1), %%mm6 \n\t" | |
453 "movq 40(%1), %%mm7 \n\t" | |
454 "pmaddwd %%mm0, %%mm1 \n\t" | |
455 "pmaddwd %%mm2, %%mm3 \n\t" | |
456 "pmaddwd %%mm0, %%mm4 \n\t" | |
457 "pmaddwd %%mm2, %%mm5 \n\t" | |
458 "pmaddwd %%mm0, %%mm6 \n\t" | |
459 "pmaddwd %%mm2, %%mm7 \n\t" | |
460 "pmaddwd 48(%1), %%mm0 \n\t" | |
461 "pmaddwd 56(%1), %%mm2 \n\t" | |
462 "paddd %%mm1, %%mm3 \n\t" | |
463 "paddd %%mm4, %%mm5 \n\t" | |
464 "paddd %%mm6, %%mm7 \n\t" | |
465 "paddd %%mm0, %%mm2 \n\t" | |
// add the rounding bias, then shift the 32-bit sums right by SHIFT_FRW_ROW
466 "movq (%2), %%mm0 \n\t" | |
467 "paddd %%mm0, %%mm3 \n\t" | |
468 "paddd %%mm0, %%mm5 \n\t" | |
469 "paddd %%mm0, %%mm7 \n\t" | |
470 "paddd %%mm0, %%mm2 \n\t" | |
471 "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" | |
472 "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" | |
473 "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" | |
474 "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" | |
// pack to int16 with signed saturation and store the 8 results
475 "packssdw %%mm5, %%mm3 \n\t" | |
476 "packssdw %%mm2, %%mm7 \n\t" | |
477 "movq %%mm3, (%3) \n\t" | |
478 "movq %%mm7, 8(%3) \n\t" | |
479 : | |
480 : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out)); | |
481 } | |
482 | |
/*
 * Row pass for one row using plain MMX instructions only.
 * Same data flow as fdct_row_mmx2, except that the reversed second half of
 * the row is built with movd/punpcklwd/psrlq instead of pshufw.
 *
 *   in    - the row's 8 int16_t inputs (16 bytes are read)
 *   out   - receives the row's 8 int16_t results (16 bytes are written)
 *   table - 32 int16_t coefficients for this row (64 bytes are read)
 *
 * Asm operands: %0 = in, %1 = table, %2 = fdct_r_row (rounding bias), %3 = out.
 *
 * NOTE(review): the asm statement declares no outputs and no clobbers even
 * though it stores through %3 and overwrites mm0..mm7 - confirm this is
 * intentional before changing surrounding code.
 */
483 static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table) | |
484 { | |
485 //FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...) | |
486 __asm__ volatile( | |
// the next four shuffles leave words in[7], in[6], in[5], in[4] (low to high) in mm1
487 "movd 12(%0), %%mm1 \n\t" | |
488 "punpcklwd 8(%0), %%mm1 \n\t" | |
489 "movq %%mm1, %%mm2 \n\t" | |
490 "psrlq $0x20, %%mm1 \n\t" | |
491 "movq 0(%0), %%mm0 \n\t" | |
492 "punpcklwd %%mm2, %%mm1 \n\t" | |
493 "movq %%mm0, %%mm5 \n\t" | |
// saturating sum (mm0) and difference (mm5) of first half and reversed second half
494 "paddsw %%mm1, %%mm0 \n\t" | |
495 "psubsw %%mm1, %%mm5 \n\t" | |
496 "movq %%mm0, %%mm2 \n\t" | |
// interleave sums and differences as dwords: low pairs in mm0, high pairs in mm2
497 "punpckldq %%mm5, %%mm0 \n\t" | |
498 "punpckhdq %%mm5, %%mm2 \n\t" | |
// multiply-accumulate against the eight 8-byte coefficient vectors of the table
499 "movq 0(%1), %%mm1 \n\t" | |
500 "movq 8(%1), %%mm3 \n\t" | |
501 "movq 16(%1), %%mm4 \n\t" | |
502 "movq 24(%1), %%mm5 \n\t" | |
503 "movq 32(%1), %%mm6 \n\t" | |
504 "movq 40(%1), %%mm7 \n\t" | |
505 "pmaddwd %%mm0, %%mm1 \n\t" | |
506 "pmaddwd %%mm2, %%mm3 \n\t" | |
507 "pmaddwd %%mm0, %%mm4 \n\t" | |
508 "pmaddwd %%mm2, %%mm5 \n\t" | |
509 "pmaddwd %%mm0, %%mm6 \n\t" | |
510 "pmaddwd %%mm2, %%mm7 \n\t" | |
511 "pmaddwd 48(%1), %%mm0 \n\t" | |
512 "pmaddwd 56(%1), %%mm2 \n\t" | |
513 "paddd %%mm1, %%mm3 \n\t" | |
514 "paddd %%mm4, %%mm5 \n\t" | |
515 "paddd %%mm6, %%mm7 \n\t" | |
516 "paddd %%mm0, %%mm2 \n\t" | |
// add the rounding bias, then shift the 32-bit sums right by SHIFT_FRW_ROW
517 "movq (%2), %%mm0 \n\t" | |
518 "paddd %%mm0, %%mm3 \n\t" | |
519 "paddd %%mm0, %%mm5 \n\t" | |
520 "paddd %%mm0, %%mm7 \n\t" | |
521 "paddd %%mm0, %%mm2 \n\t" | |
522 "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" | |
523 "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" | |
524 "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" | |
525 "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" | |
// pack to int16 with signed saturation and store the 8 results
526 "packssdw %%mm5, %%mm3 \n\t" | |
527 "packssdw %%mm2, %%mm7 \n\t" | |
528 "movq %%mm3, 0(%3) \n\t" | |
529 "movq %%mm7, 8(%3) \n\t" | |
530 : | |
531 : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out)); | |
532 } | |
533 | |
/*
 * Forward DCT, MMX version. Transforms the 64 int16_t values at block
 * (8 rows of 8) in place.
 *
 * The column pass writes into a 128-byte, 8-byte-aligned temporary; it is run
 * twice (offsets 0 and 4) because each fdct_col_mmx call covers four columns.
 * The row pass then reads the temporary row by row and writes the results
 * back into block, using the next 32-entry group of tab_frw_01234567 for
 * each row.
 *
 * NOTE(review): no emms instruction is issued in this file; presumably the
 * caller is responsible for resetting the MMX state - confirm.
 */
534 void ff_fdct_mmx(int16_t *block) | |
535 { | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
536 DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; |
8430 | 537 int16_t * block1= (int16_t*)align_tmp; |
538 const int16_t *table= tab_frw_01234567; | |
539 int i; | |
540 | |
541 fdct_col_mmx(block, block1, 0); | |
542 fdct_col_mmx(block, block1, 4); | |
543 | |
// one iteration per row: advance source, destination and coefficient group together
544 for(i=8;i>0;i--) { | |
545 fdct_row_mmx(block1, block, table); | |
546 block1 += 8; | |
547 table += 32; | |
548 block += 8; | |
549 } | |
550 } | |
551 | |
/*
 * Forward DCT, MMX2 version. Transforms the 64 int16_t values at block
 * (8 rows of 8) in place.
 *
 * Identical in structure to ff_fdct_mmx: the same two fdct_col_mmx column
 * passes into an 8-byte-aligned temporary, followed by eight row passes back
 * into block; only the row routine differs (fdct_row_mmx2, which uses pshufw).
 *
 * NOTE(review): no emms instruction is issued in this file; presumably the
 * caller is responsible for resetting the MMX state - confirm.
 */
552 void ff_fdct_mmx2(int16_t *block) | |
553 { | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
554 DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; |
8430 | 555 int16_t *block1= (int16_t*)align_tmp; |
556 const int16_t *table= tab_frw_01234567; | |
557 int i; | |
558 | |
559 fdct_col_mmx(block, block1, 0); | |
560 fdct_col_mmx(block, block1, 4); | |
561 | |
// one iteration per row: advance source, destination and coefficient group together
562 for(i=8;i>0;i--) { | |
563 fdct_row_mmx2(block1, block, table); | |
564 block1 += 8; | |
565 table += 32; | |
566 block += 8; | |
567 } | |
568 } | |
569 | |
/*
 * Forward DCT, SSE2 version. Transforms the 64 int16_t values at block
 * (8 rows of 8) in place.
 *
 * A single fdct_col_sse2 call writes the column pass result into a 128-byte,
 * 16-byte-aligned temporary, and a single fdct_row_sse2 call transforms all
 * eight rows back into block.
 *
 * Both passes access block with movdqa, which requires 16-byte-aligned
 * addresses, so block presumably must be 16-byte aligned - confirm against
 * the callers.
 */
570 void ff_fdct_sse2(int16_t *block) | |
571 { | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
572 DECLARE_ALIGNED(16, int64_t, align_tmp)[16]; |
8430 | 573 int16_t * const block1= (int16_t*)align_tmp; |
574 | |
575 fdct_col_sse2(block, block1, 0); | |
576 fdct_row_sse2(block1, block); | |
577 } | |
578 |