Mercurial > libavcodec.hg
annotate x86/fdct_mmx.c @ 12492:58a960d6e34c libavcodec
Rename h264_idct_sse2.asm to h264_idct.asm; move inline IDCT asm from
h264dsp_mmx.c to h264_idct.asm (as yasm code). Because the loops are now
coded in asm instead of C, this is (depending on the function) up to 50%
faster for cases where gcc didn't do a great job at looping.
Since h264_idct_add8() is now faster than the manual loop setup in h264.c,
in-asm idct calling can now be enabled for chroma as well (see r16207). For
MMX, this is 5% faster. For SSE2 (which isn't done for chroma if h264.c does
the looping), this makes it up to 50% faster. Speed gain overall is ~0.5-1.0%.
author | rbultje |
---|---|
date | Tue, 14 Sep 2010 13:36:26 +0000 |
parents | 7be32921237f |
children |
rev | line source |
---|---|
8430 | 1 /* |
2 * MMX optimized forward DCT | |
3 * The gcc porting is Copyright (c) 2001 Fabrice Bellard. | |
4 * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | |
5 * SSE2 optimization is Copyright (c) 2004 Denes Balatoni. | |
6 * | |
7 * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT | |
8 * | |
9 * Intel Application Note AP-922 - fast, precise implementation of DCT | |
10 * http://developer.intel.com/vtune/cbts/appnotes.htm | |
11 * | |
12 * Also of inspiration: | |
13 * a page about fdct at http://www.geocities.com/ssavekar/dct.htm | |
14 * Skal's fdct at http://skal.planet-d.net/coding/dct.html | |
15 * | |
16 * This file is part of FFmpeg. | |
17 * | |
18 * FFmpeg is free software; you can redistribute it and/or | |
19 * modify it under the terms of the GNU Lesser General Public | |
20 * License as published by the Free Software Foundation; either | |
21 * version 2.1 of the License, or (at your option) any later version. | |
22 * | |
23 * FFmpeg is distributed in the hope that it will be useful, | |
24 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
26 * Lesser General Public License for more details. | |
27 * | |
28 * You should have received a copy of the GNU Lesser General Public | |
29 * License along with FFmpeg; if not, write to the Free Software | |
30 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
31 */ | |
32 | |
33 #include "libavutil/common.h" | |
34 #include "libavcodec/dsputil.h" | |
35 | |
36 ////////////////////////////////////////////////////////////////////// | |
37 // | |
38 // constants for the forward DCT | |
39 // ----------------------------- | |
40 // | |
41 // Be sure to check that your compiler is aligning all constants to QWORD | |
42 // (8-byte) memory boundaries! Otherwise the unaligned memory access will | |
43 // severely stall MMX execution. | |
44 // | |
45 ////////////////////////////////////////////////////////////////////// | |
46 | |
/*
 * Fixed-point scaling parameters of the forward DCT.
 *
 * BITS_FRW_ACC  - extra precision bits (2 or 3) carried by the column pass.
 * SHIFT_FRW_COL - left shift applied to the inputs of the column pass.
 * SHIFT_FRW_ROW - right shift applied to the 32-bit sums of the row pass.
 * RND_FRW_ROW   - rounding bias (half of 1 << SHIFT_FRW_ROW) added before
 *                 that right shift.
 *
 * SHIFT_FRW_COL and SHIFT_FRW_ROW are pasted into inline-asm text through
 * the S() stringification macro, so their expansions must stay plain integer
 * expressions; do not add casts or suffixes to them.
 */
#define BITS_FRW_ACC   3
#define SHIFT_FRW_COL  BITS_FRW_ACC
#define SHIFT_FRW_ROW  (BITS_FRW_ACC + 17 - 3)
#define RND_FRW_ROW    (1 << (SHIFT_FRW_ROW-1))
/* A column rounding constant (1 << (SHIFT_FRW_COL-1)) is deliberately not
 * defined: nothing in the code visible in this file references one. */

/* Expand to eight comma-separated copies of x (fills one 8 x int16 vector). */
#define X8(x) x,x,x,x,x,x,x,x
54 | |
55 //concatenated table, for forward DCT transformation | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
56 DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = { |
8430 | 57 X8(13036), // tg * (2<<16) + 0.5 |
58 X8(27146), // tg * (2<<16) + 0.5 | |
59 X8(-21746) // tg * (2<<16) + 0.5 | |
60 }; | |
61 | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
62 DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = { |
8430 | 63 X8(23170) //cos * (2<<15) + 0.5 |
64 }; | |
65 | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
66 DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) }; |
8430 | 67 |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
68 DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW }; |
8430 | 69 |
70 static struct | |
71 { | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
72 DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4]; |
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
73 } fdct_r_row_sse2 = |
8430 | 74 {{ |
75 RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW | |
76 }}; | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
77 //DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; |
8430 | 78 |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
79 DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table |
8430 | 80 16384, 16384, 22725, 19266, |
81 16384, 16384, 12873, 4520, | |
82 21407, 8867, 19266, -4520, | |
83 -8867, -21407, -22725, -12873, | |
84 16384, -16384, 12873, -22725, | |
85 -16384, 16384, 4520, 19266, | |
86 8867, -21407, 4520, -12873, | |
87 21407, -8867, 19266, -22725, | |
88 | |
89 22725, 22725, 31521, 26722, | |
90 22725, 22725, 17855, 6270, | |
91 29692, 12299, 26722, -6270, | |
92 -12299, -29692, -31521, -17855, | |
93 22725, -22725, 17855, -31521, | |
94 -22725, 22725, 6270, 26722, | |
95 12299, -29692, 6270, -17855, | |
96 29692, -12299, 26722, -31521, | |
97 | |
98 21407, 21407, 29692, 25172, | |
99 21407, 21407, 16819, 5906, | |
100 27969, 11585, 25172, -5906, | |
101 -11585, -27969, -29692, -16819, | |
102 21407, -21407, 16819, -29692, | |
103 -21407, 21407, 5906, 25172, | |
104 11585, -27969, 5906, -16819, | |
105 27969, -11585, 25172, -29692, | |
106 | |
107 19266, 19266, 26722, 22654, | |
108 19266, 19266, 15137, 5315, | |
109 25172, 10426, 22654, -5315, | |
110 -10426, -25172, -26722, -15137, | |
111 19266, -19266, 15137, -26722, | |
112 -19266, 19266, 5315, 22654, | |
113 10426, -25172, 5315, -15137, | |
114 25172, -10426, 22654, -26722, | |
115 | |
116 16384, 16384, 22725, 19266, | |
117 16384, 16384, 12873, 4520, | |
118 21407, 8867, 19266, -4520, | |
119 -8867, -21407, -22725, -12873, | |
120 16384, -16384, 12873, -22725, | |
121 -16384, 16384, 4520, 19266, | |
122 8867, -21407, 4520, -12873, | |
123 21407, -8867, 19266, -22725, | |
124 | |
125 19266, 19266, 26722, 22654, | |
126 19266, 19266, 15137, 5315, | |
127 25172, 10426, 22654, -5315, | |
128 -10426, -25172, -26722, -15137, | |
129 19266, -19266, 15137, -26722, | |
130 -19266, 19266, 5315, 22654, | |
131 10426, -25172, 5315, -15137, | |
132 25172, -10426, 22654, -26722, | |
133 | |
134 21407, 21407, 29692, 25172, | |
135 21407, 21407, 16819, 5906, | |
136 27969, 11585, 25172, -5906, | |
137 -11585, -27969, -29692, -16819, | |
138 21407, -21407, 16819, -29692, | |
139 -21407, 21407, 5906, 25172, | |
140 11585, -27969, 5906, -16819, | |
141 27969, -11585, 25172, -29692, | |
142 | |
143 22725, 22725, 31521, 26722, | |
144 22725, 22725, 17855, 6270, | |
145 29692, 12299, 26722, -6270, | |
146 -12299, -29692, -31521, -17855, | |
147 22725, -22725, 17855, -31521, | |
148 -22725, 22725, 6270, 26722, | |
149 12299, -29692, 6270, -17855, | |
150 29692, -12299, 26722, -31521, | |
151 }; | |
152 | |
153 static struct | |
154 { | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
155 DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256]; |
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
156 } tab_frw_01234567_sse2 = |
8430 | 157 {{ |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
158 //DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table |
8430 | 159 #define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ |
160 C4, C4, C5, C7, C2, C6, C3, -C7, \ | |
161 -C4, C4, C7, C3, C6, -C2, C7, -C5, \ | |
162 C4, -C4, C5, -C1, C2, -C6, C3, -C1, | |
163 // c1..c7 * cos(pi/4) * 2^15 | |
164 #define C1 22725 | |
165 #define C2 21407 | |
166 #define C3 19266 | |
167 #define C4 16384 | |
168 #define C5 12873 | |
169 #define C6 8867 | |
170 #define C7 4520 | |
171 TABLE_SSE2 | |
172 | |
173 #undef C1 | |
174 #undef C2 | |
175 #undef C3 | |
176 #undef C4 | |
177 #undef C5 | |
178 #undef C6 | |
179 #undef C7 | |
180 #define C1 31521 | |
181 #define C2 29692 | |
182 #define C3 26722 | |
183 #define C4 22725 | |
184 #define C5 17855 | |
185 #define C6 12299 | |
186 #define C7 6270 | |
187 TABLE_SSE2 | |
188 | |
189 #undef C1 | |
190 #undef C2 | |
191 #undef C3 | |
192 #undef C4 | |
193 #undef C5 | |
194 #undef C6 | |
195 #undef C7 | |
196 #define C1 29692 | |
197 #define C2 27969 | |
198 #define C3 25172 | |
199 #define C4 21407 | |
200 #define C5 16819 | |
201 #define C6 11585 | |
202 #define C7 5906 | |
203 TABLE_SSE2 | |
204 | |
205 #undef C1 | |
206 #undef C2 | |
207 #undef C3 | |
208 #undef C4 | |
209 #undef C5 | |
210 #undef C6 | |
211 #undef C7 | |
212 #define C1 26722 | |
213 #define C2 25172 | |
214 #define C3 22654 | |
215 #define C4 19266 | |
216 #define C5 15137 | |
217 #define C6 10426 | |
218 #define C7 5315 | |
219 TABLE_SSE2 | |
220 | |
221 #undef C1 | |
222 #undef C2 | |
223 #undef C3 | |
224 #undef C4 | |
225 #undef C5 | |
226 #undef C6 | |
227 #undef C7 | |
228 #define C1 22725 | |
229 #define C2 21407 | |
230 #define C3 19266 | |
231 #define C4 16384 | |
232 #define C5 12873 | |
233 #define C6 8867 | |
234 #define C7 4520 | |
235 TABLE_SSE2 | |
236 | |
237 #undef C1 | |
238 #undef C2 | |
239 #undef C3 | |
240 #undef C4 | |
241 #undef C5 | |
242 #undef C6 | |
243 #undef C7 | |
244 #define C1 26722 | |
245 #define C2 25172 | |
246 #define C3 22654 | |
247 #define C4 19266 | |
248 #define C5 15137 | |
249 #define C6 10426 | |
250 #define C7 5315 | |
251 TABLE_SSE2 | |
252 | |
253 #undef C1 | |
254 #undef C2 | |
255 #undef C3 | |
256 #undef C4 | |
257 #undef C5 | |
258 #undef C6 | |
259 #undef C7 | |
260 #define C1 29692 | |
261 #define C2 27969 | |
262 #define C3 25172 | |
263 #define C4 21407 | |
264 #define C5 16819 | |
265 #define C6 11585 | |
266 #define C7 5906 | |
267 TABLE_SSE2 | |
268 | |
269 #undef C1 | |
270 #undef C2 | |
271 #undef C3 | |
272 #undef C4 | |
273 #undef C5 | |
274 #undef C6 | |
275 #undef C7 | |
276 #define C1 31521 | |
277 #define C2 29692 | |
278 #define C3 26722 | |
279 #define C4 22725 | |
280 #define C5 17855 | |
281 #define C6 12299 | |
282 #define C7 6270 | |
283 TABLE_SSE2 | |
284 }}; | |
285 | |
/*
 * Short alias for AV_TOSTRING (AV_STRINGIFY is too long). Used below to paste
 * SHIFT_FRW_COL / SHIFT_FRW_ROW into inline-asm instruction text.
 */
#define S(s) AV_TOSTRING(s)
287 | |
288 #define FDCT_COL(cpu, mm, mov)\ | |
289 static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\ | |
290 {\ | |
291 __asm__ volatile (\ | |
292 #mov" 16(%0), %%"#mm"0 \n\t" \ | |
293 #mov" 96(%0), %%"#mm"1 \n\t" \ | |
294 #mov" %%"#mm"0, %%"#mm"2 \n\t" \ | |
295 #mov" 32(%0), %%"#mm"3 \n\t" \ | |
296 "paddsw %%"#mm"1, %%"#mm"0 \n\t" \ | |
297 #mov" 80(%0), %%"#mm"4 \n\t" \ | |
298 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \ | |
299 #mov" (%0), %%"#mm"5 \n\t" \ | |
300 "paddsw %%"#mm"3, %%"#mm"4 \n\t" \ | |
301 "paddsw 112(%0), %%"#mm"5 \n\t" \ | |
302 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \ | |
303 #mov" %%"#mm"0, %%"#mm"6 \n\t" \ | |
304 "psubsw %%"#mm"1, %%"#mm"2 \n\t" \ | |
305 #mov" 16(%1), %%"#mm"1 \n\t" \ | |
306 "psubsw %%"#mm"4, %%"#mm"0 \n\t" \ | |
307 #mov" 48(%0), %%"#mm"7 \n\t" \ | |
308 "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \ | |
309 "paddsw 64(%0), %%"#mm"7 \n\t" \ | |
310 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \ | |
311 "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ | |
312 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \ | |
313 #mov" %%"#mm"5, %%"#mm"4 \n\t" \ | |
314 "psubsw %%"#mm"7, %%"#mm"5 \n\t" \ | |
315 "paddsw %%"#mm"5, %%"#mm"1 \n\t" \ | |
316 "paddsw %%"#mm"7, %%"#mm"4 \n\t" \ | |
317 "por (%2), %%"#mm"1 \n\t" \ | |
318 "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \ | |
319 "pmulhw 16(%1), %%"#mm"5 \n\t" \ | |
320 #mov" %%"#mm"4, %%"#mm"7 \n\t" \ | |
321 "psubsw 80(%0), %%"#mm"3 \n\t" \ | |
322 "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ | |
323 #mov" %%"#mm"1, 32(%3) \n\t" \ | |
324 "paddsw %%"#mm"6, %%"#mm"7 \n\t" \ | |
325 #mov" 48(%0), %%"#mm"1 \n\t" \ | |
326 "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \ | |
327 "psubsw 64(%0), %%"#mm"1 \n\t" \ | |
328 #mov" %%"#mm"2, %%"#mm"6 \n\t" \ | |
329 #mov" %%"#mm"4, 64(%3) \n\t" \ | |
330 "paddsw %%"#mm"3, %%"#mm"2 \n\t" \ | |
331 "pmulhw (%4), %%"#mm"2 \n\t" \ | |
332 "psubsw %%"#mm"3, %%"#mm"6 \n\t" \ | |
333 "pmulhw (%4), %%"#mm"6 \n\t" \ | |
334 "psubsw %%"#mm"0, %%"#mm"5 \n\t" \ | |
335 "por (%2), %%"#mm"5 \n\t" \ | |
336 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \ | |
337 "por (%2), %%"#mm"2 \n\t" \ | |
338 #mov" %%"#mm"1, %%"#mm"4 \n\t" \ | |
339 #mov" (%0), %%"#mm"3 \n\t" \ | |
340 "paddsw %%"#mm"6, %%"#mm"1 \n\t" \ | |
341 "psubsw 112(%0), %%"#mm"3 \n\t" \ | |
342 "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ | |
343 #mov" (%1), %%"#mm"0 \n\t" \ | |
344 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \ | |
345 #mov" 32(%1), %%"#mm"6 \n\t" \ | |
346 "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \ | |
347 #mov" %%"#mm"7, (%3) \n\t" \ | |
348 "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \ | |
349 #mov" %%"#mm"5, 96(%3) \n\t" \ | |
350 #mov" %%"#mm"3, %%"#mm"7 \n\t" \ | |
351 #mov" 32(%1), %%"#mm"5 \n\t" \ | |
352 "psubsw %%"#mm"2, %%"#mm"7 \n\t" \ | |
353 "paddsw %%"#mm"2, %%"#mm"3 \n\t" \ | |
354 "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \ | |
355 "paddsw %%"#mm"3, %%"#mm"0 \n\t" \ | |
356 "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ | |
357 "pmulhw (%1), %%"#mm"3 \n\t" \ | |
358 "por (%2), %%"#mm"0 \n\t" \ | |
359 "paddsw %%"#mm"7, %%"#mm"5 \n\t" \ | |
360 "psubsw %%"#mm"6, %%"#mm"7 \n\t" \ | |
361 #mov" %%"#mm"0, 16(%3) \n\t" \ | |
362 "paddsw %%"#mm"4, %%"#mm"5 \n\t" \ | |
363 #mov" %%"#mm"7, 48(%3) \n\t" \ | |
364 "psubsw %%"#mm"1, %%"#mm"3 \n\t" \ | |
365 #mov" %%"#mm"5, 80(%3) \n\t" \ | |
366 #mov" %%"#mm"3, 112(%3) \n\t" \ | |
367 : \ | |
368 : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \ | |
369 "r" (out + offset), "r" (ocos_4_16)); \ | |
370 } | |
371 | |
372 FDCT_COL(mmx, mm, movq) | |
373 FDCT_COL(sse2, xmm, movdqa) | |
374 | |
375 static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) | |
376 { | |
377 __asm__ volatile( | |
378 #define FDCT_ROW_SSE2_H1(i,t) \ | |
379 "movq " #i "(%0), %%xmm2 \n\t" \ | |
380 "movq " #i "+8(%0), %%xmm0 \n\t" \ | |
381 "movdqa " #t "+32(%1), %%xmm3 \n\t" \ | |
382 "movdqa " #t "+48(%1), %%xmm7 \n\t" \ | |
383 "movdqa " #t "(%1), %%xmm4 \n\t" \ | |
384 "movdqa " #t "+16(%1), %%xmm5 \n\t" | |
385 | |
386 #define FDCT_ROW_SSE2_H2(i,t) \ | |
387 "movq " #i "(%0), %%xmm2 \n\t" \ | |
388 "movq " #i "+8(%0), %%xmm0 \n\t" \ | |
389 "movdqa " #t "+32(%1), %%xmm3 \n\t" \ | |
390 "movdqa " #t "+48(%1), %%xmm7 \n\t" | |
391 | |
392 #define FDCT_ROW_SSE2(i) \ | |
393 "movq %%xmm2, %%xmm1 \n\t" \ | |
394 "pshuflw $27, %%xmm0, %%xmm0 \n\t" \ | |
395 "paddsw %%xmm0, %%xmm1 \n\t" \ | |
396 "psubsw %%xmm0, %%xmm2 \n\t" \ | |
397 "punpckldq %%xmm2, %%xmm1 \n\t" \ | |
398 "pshufd $78, %%xmm1, %%xmm2 \n\t" \ | |
399 "pmaddwd %%xmm2, %%xmm3 \n\t" \ | |
400 "pmaddwd %%xmm1, %%xmm7 \n\t" \ | |
401 "pmaddwd %%xmm5, %%xmm2 \n\t" \ | |
402 "pmaddwd %%xmm4, %%xmm1 \n\t" \ | |
403 "paddd %%xmm7, %%xmm3 \n\t" \ | |
404 "paddd %%xmm2, %%xmm1 \n\t" \ | |
405 "paddd %%xmm6, %%xmm3 \n\t" \ | |
406 "paddd %%xmm6, %%xmm1 \n\t" \ | |
407 "psrad %3, %%xmm3 \n\t" \ | |
408 "psrad %3, %%xmm1 \n\t" \ | |
409 "packssdw %%xmm3, %%xmm1 \n\t" \ | |
410 "movdqa %%xmm1, " #i "(%4) \n\t" | |
411 | |
412 "movdqa (%2), %%xmm6 \n\t" | |
413 FDCT_ROW_SSE2_H1(0,0) | |
414 FDCT_ROW_SSE2(0) | |
415 FDCT_ROW_SSE2_H2(64,0) | |
416 FDCT_ROW_SSE2(64) | |
417 | |
418 FDCT_ROW_SSE2_H1(16,64) | |
419 FDCT_ROW_SSE2(16) | |
420 FDCT_ROW_SSE2_H2(112,64) | |
421 FDCT_ROW_SSE2(112) | |
422 | |
423 FDCT_ROW_SSE2_H1(32,128) | |
424 FDCT_ROW_SSE2(32) | |
425 FDCT_ROW_SSE2_H2(96,128) | |
426 FDCT_ROW_SSE2(96) | |
427 | |
428 FDCT_ROW_SSE2_H1(48,192) | |
429 FDCT_ROW_SSE2(48) | |
430 FDCT_ROW_SSE2_H2(80,192) | |
431 FDCT_ROW_SSE2(80) | |
432 : | |
433 : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) | |
434 ); | |
435 } | |
436 | |
437 static av_always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table) | |
438 { | |
439 __asm__ volatile ( | |
440 "pshufw $0x1B, 8(%0), %%mm5 \n\t" | |
441 "movq (%0), %%mm0 \n\t" | |
442 "movq %%mm0, %%mm1 \n\t" | |
443 "paddsw %%mm5, %%mm0 \n\t" | |
444 "psubsw %%mm5, %%mm1 \n\t" | |
445 "movq %%mm0, %%mm2 \n\t" | |
446 "punpckldq %%mm1, %%mm0 \n\t" | |
447 "punpckhdq %%mm1, %%mm2 \n\t" | |
448 "movq (%1), %%mm1 \n\t" | |
449 "movq 8(%1), %%mm3 \n\t" | |
450 "movq 16(%1), %%mm4 \n\t" | |
451 "movq 24(%1), %%mm5 \n\t" | |
452 "movq 32(%1), %%mm6 \n\t" | |
453 "movq 40(%1), %%mm7 \n\t" | |
454 "pmaddwd %%mm0, %%mm1 \n\t" | |
455 "pmaddwd %%mm2, %%mm3 \n\t" | |
456 "pmaddwd %%mm0, %%mm4 \n\t" | |
457 "pmaddwd %%mm2, %%mm5 \n\t" | |
458 "pmaddwd %%mm0, %%mm6 \n\t" | |
459 "pmaddwd %%mm2, %%mm7 \n\t" | |
460 "pmaddwd 48(%1), %%mm0 \n\t" | |
461 "pmaddwd 56(%1), %%mm2 \n\t" | |
462 "paddd %%mm1, %%mm3 \n\t" | |
463 "paddd %%mm4, %%mm5 \n\t" | |
464 "paddd %%mm6, %%mm7 \n\t" | |
465 "paddd %%mm0, %%mm2 \n\t" | |
466 "movq (%2), %%mm0 \n\t" | |
467 "paddd %%mm0, %%mm3 \n\t" | |
468 "paddd %%mm0, %%mm5 \n\t" | |
469 "paddd %%mm0, %%mm7 \n\t" | |
470 "paddd %%mm0, %%mm2 \n\t" | |
471 "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" | |
472 "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" | |
473 "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" | |
474 "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" | |
475 "packssdw %%mm5, %%mm3 \n\t" | |
476 "packssdw %%mm2, %%mm7 \n\t" | |
477 "movq %%mm3, (%3) \n\t" | |
478 "movq %%mm7, 8(%3) \n\t" | |
479 : | |
480 : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out)); | |
481 } | |
482 | |
483 static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table) | |
484 { | |
485 //FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...) | |
486 __asm__ volatile( | |
487 "movd 12(%0), %%mm1 \n\t" | |
488 "punpcklwd 8(%0), %%mm1 \n\t" | |
489 "movq %%mm1, %%mm2 \n\t" | |
490 "psrlq $0x20, %%mm1 \n\t" | |
491 "movq 0(%0), %%mm0 \n\t" | |
492 "punpcklwd %%mm2, %%mm1 \n\t" | |
493 "movq %%mm0, %%mm5 \n\t" | |
494 "paddsw %%mm1, %%mm0 \n\t" | |
495 "psubsw %%mm1, %%mm5 \n\t" | |
496 "movq %%mm0, %%mm2 \n\t" | |
497 "punpckldq %%mm5, %%mm0 \n\t" | |
498 "punpckhdq %%mm5, %%mm2 \n\t" | |
499 "movq 0(%1), %%mm1 \n\t" | |
500 "movq 8(%1), %%mm3 \n\t" | |
501 "movq 16(%1), %%mm4 \n\t" | |
502 "movq 24(%1), %%mm5 \n\t" | |
503 "movq 32(%1), %%mm6 \n\t" | |
504 "movq 40(%1), %%mm7 \n\t" | |
505 "pmaddwd %%mm0, %%mm1 \n\t" | |
506 "pmaddwd %%mm2, %%mm3 \n\t" | |
507 "pmaddwd %%mm0, %%mm4 \n\t" | |
508 "pmaddwd %%mm2, %%mm5 \n\t" | |
509 "pmaddwd %%mm0, %%mm6 \n\t" | |
510 "pmaddwd %%mm2, %%mm7 \n\t" | |
511 "pmaddwd 48(%1), %%mm0 \n\t" | |
512 "pmaddwd 56(%1), %%mm2 \n\t" | |
513 "paddd %%mm1, %%mm3 \n\t" | |
514 "paddd %%mm4, %%mm5 \n\t" | |
515 "paddd %%mm6, %%mm7 \n\t" | |
516 "paddd %%mm0, %%mm2 \n\t" | |
517 "movq (%2), %%mm0 \n\t" | |
518 "paddd %%mm0, %%mm3 \n\t" | |
519 "paddd %%mm0, %%mm5 \n\t" | |
520 "paddd %%mm0, %%mm7 \n\t" | |
521 "paddd %%mm0, %%mm2 \n\t" | |
522 "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" | |
523 "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" | |
524 "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" | |
525 "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" | |
526 "packssdw %%mm5, %%mm3 \n\t" | |
527 "packssdw %%mm2, %%mm7 \n\t" | |
528 "movq %%mm3, 0(%3) \n\t" | |
529 "movq %%mm7, 8(%3) \n\t" | |
530 : | |
531 : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out)); | |
532 } | |
533 | |
534 void ff_fdct_mmx(int16_t *block) | |
535 { | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
536 DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; |
8430 | 537 int16_t * block1= (int16_t*)align_tmp; |
538 const int16_t *table= tab_frw_01234567; | |
539 int i; | |
540 | |
541 fdct_col_mmx(block, block1, 0); | |
542 fdct_col_mmx(block, block1, 4); | |
543 | |
544 for(i=8;i>0;i--) { | |
545 fdct_row_mmx(block1, block, table); | |
546 block1 += 8; | |
547 table += 32; | |
548 block += 8; | |
549 } | |
550 } | |
551 | |
552 void ff_fdct_mmx2(int16_t *block) | |
553 { | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
554 DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; |
8430 | 555 int16_t *block1= (int16_t*)align_tmp; |
556 const int16_t *table= tab_frw_01234567; | |
557 int i; | |
558 | |
559 fdct_col_mmx(block, block1, 0); | |
560 fdct_col_mmx(block, block1, 4); | |
561 | |
562 for(i=8;i>0;i--) { | |
563 fdct_row_mmx2(block1, block, table); | |
564 block1 += 8; | |
565 table += 32; | |
566 block += 8; | |
567 } | |
568 } | |
569 | |
570 void ff_fdct_sse2(int16_t *block) | |
571 { | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
8430
diff
changeset
|
572 DECLARE_ALIGNED(16, int64_t, align_tmp)[16]; |
8430 | 573 int16_t * const block1= (int16_t*)align_tmp; |
574 | |
575 fdct_col_sse2(block, block1, 0); | |
576 fdct_row_sse2(block1, block); | |
577 } | |
578 |