Mercurial > libavcodec.hg
comparison x86/dct32_sse.c @ 12100:db9ef48dc0e4 libavcodec
Move SSE optimized 32-point DCT to its own file. Should fix breakage with YASM
disabled.
author | vitor |
---|---|
date | Tue, 06 Jul 2010 17:48:23 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
12099:1bf322283429 | 12100:db9ef48dc0e4 |
---|---|
1 /* | |
2 * 32 point SSE-optimized DCT transform | |
3 * Copyright (c) 2010 Vitor Sessak | |
4 * | |
5 * This file is part of FFmpeg. | |
6 * | |
7 * FFmpeg is free software; you can redistribute it and/or | |
8 * modify it under the terms of the GNU Lesser General Public | |
9 * License as published by the Free Software Foundation; either | |
10 * version 2.1 of the License, or (at your option) any later version. | |
11 * | |
12 * FFmpeg is distributed in the hope that it will be useful, | |
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
18 * License along with FFmpeg; if not, write to the Free Software | |
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 */ | |
21 | |
22 #include <stdint.h> | |
23 | |
24 #include "libavutil/x86_cpu.h" | |
25 #include "libavutil/mem.h" | |
26 #include "libavcodec/dsputil.h" | |
27 #include "fft.h" | |
28 | |
/*
 * Multiplier table for ff_dct32_float_sse(): nine rows of four floats.
 * The asm addresses it through operand %2 in 16-byte steps (one row per
 * step) and uses each row as the multiplier operand of a butterfly macro.
 * The rows are read with movaps/mulps memory operands, which is why the
 * table is declared through DECLARE_ALIGNED(16, ...).
 *
 * NOTE(review): the first row matches 0.5 / cos((2k + 1) * pi / 64) for
 * k = 0..3; the derivation of the other rows (including the negative
 * entries) is not visible in this file -- confirm against the reference
 * C implementation of the DCT.
 */
29 DECLARE_ALIGNED(16, static const float, b1)[] = { | |
/* byte offsets 0, 16, 32, 48: multipliers used in pass 1 */
30 0.500603, 0.505471, 0.515447, 0.531043, | |
31 0.553104, 0.582935, 0.622504, 0.674808, | |
32 -1.169440, -0.972568, -0.839350, -0.744536, | |
33 -10.190008, -3.407609, -2.057781, -1.484165, | |
/* byte offsets 64, 80: multipliers used in pass 2 */
34 0.502419, 0.522499, 0.566944, 0.646822, | |
35 0.788155, 1.060678, 1.722447, 5.101149, | |
/* byte offset 96: multipliers used in pass 3 */
36 0.509796, 0.601345, 0.899976, 2.562916, | |
/* byte offset 112: multipliers used in pass 4 */
37 1.000000, 1.000000, 1.306563, 0.541196, | |
/* byte offset 128: multipliers used in pass 5 */
38 1.000000, 0.707107, 1.000000, -0.707107 | |
39 }; | |
40 | |
/*
 * Sign mask for the in-register butterflies (BUTTERFLY0 and its aliases).
 * ff_dct32_float_sse() loads it into xmm3 before pass 4; the macro XORs it
 * into a copy of the data, so every element holding 0x80000000 (the sign
 * bit of a 32-bit float) negates the corresponding vector element.
 * As stored, elements 2 and 3 are negated. Before pass 5 the function
 * permutes the register with "shufps $0xCC", which moves the set bits to
 * elements 1 and 3.
 *
 * NOTE(review): 0x80000000 does not fit in int32_t, so the initializer
 * relies on implementation-defined conversion; the asm only uses the bit
 * pattern.
 */
41 DECLARE_ALIGNED(16, static const int32_t, smask)[4] = { | |
42 0, 0, 0x80000000, 0x80000000 | |
43 }; | |
44 | |
45 /* butterfly operator */ | |
/*
 * Expands to asm text operating on two SSE registers a and b:
 *   tmp = a;  a = a - b;  b = b + tmp;  a = a * c;
 * i.e. on exit b holds the element-wise sum (a + b) and a holds the
 * element-wise difference (a - b) scaled by c.
 *
 * a, b, tmp: register names without the "%%" prefix (e.g. xmm0);
 *            tmp is overwritten with the original value of a.
 * c:         pasted verbatim as the mulps source operand, so it may be a
 *            memory operand such as 48(%2) or a register spelled %%xmm2.
 */
46 #define BUTTERFLY(a,b,c,tmp) \ | |
47 "movaps %%" #a ", %%" #tmp " \n\t" \ | |
48 "subps %%" #b ", %%" #a " \n\t" \ | |
49 "addps %%" #tmp ", %%" #b " \n\t" \ | |
50 "mulps " #c ", %%" #a " \n\t" | |
51 | |
52 ///* Same as BUTTERFLY when vectors a and b overlap */ | |
/*
 * Butterfly inside a single register. Expands to asm text computing
 *   tmp = val;  val = shuffle(val);  tmp ^= mask;  val += tmp;  val *= cos;
 * so val ends up as (shuffle(val) + sign-flipped val) * cos, where the
 * elements whose sign bit is set in mask are the ones that get negated.
 *
 * val, mask, cos, tmp: register names without the "%%" prefix; tmp is
 *                      overwritten, mask and cos are only read.
 * shuf:                shufps immediate including the '$' (e.g. $0x1b),
 *                      selecting how the elements of val are permuted.
 */
53 #define BUTTERFLY0(val, mask, cos, tmp, shuf) \ | |
54 "movaps %%" #val ", %%" #tmp " \n\t" \ | |
55 "shufps " #shuf ", %%" #val ",%%" #val " \n\t" \ | |
56 "xorps %%" #mask ", %%" #tmp " \n\t" /* flip signs */ \ | |
57 "addps %%" #tmp ", %%" #val " \n\t" \ | |
58 "mulps %%" #cos ", %%" #val " \n\t" | |
59 | |
/*
 * Fixed-shuffle variants of BUTTERFLY0:
 *   BUTTERFLY2 uses $0x1b, which reverses the order of the four elements
 *              (element 0 pairs with 3, element 1 with 2);
 *   BUTTERFLY3 uses $0xb1, which swaps the two elements inside each
 *              adjacent pair (0 with 1, 2 with 3).
 */
60 #define BUTTERFLY2(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0x1b) | |
61 #define BUTTERFLY3(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0xb1) | |
62 | |
/**
 * 32-point DCT written as a single SSE inline-asm statement.
 *
 * The asm reads 128 bytes from in (eight movaps loads at offsets 0..112)
 * and leaves its result in the 128 bytes at out. out is also used as
 * scratch storage between passes: intermediate vectors are stored there
 * and reloaded later. Passes 1-5 are vector butterflies built from the
 * BUTTERFLY* macros with multipliers taken from b1; pass 6 is a long
 * sequence of scalar additions (addss) and 32-bit moves that produces the
 * final values in place in out.
 *
 * @param out destination buffer; accessed with movaps, so it must be
 *            16-byte aligned
 * @param in  source buffer; accessed with movaps, so it must be
 *            16-byte aligned
 *
 * NOTE(review): the code treats FFTSample as a 32-bit float (ps/ss
 * instructions, 4-byte strides) -- confirm against its declaration.
 * NOTE(review): xmm0-xmm7 are all overwritten but none is named in the
 * clobber list -- confirm this is acceptable for every target ABI.
 */
63 void ff_dct32_float_sse(FFTSample *out, const FFTSample *in) | |
64 { | |
/* scratch integer register (%0) used by pass 6 to copy 32-bit words
 * from one position of out to another with movl */
65 int32_t tmp1 = 0; | |
/* operand map: %0 = tmp1, %1 = out, %2 = b1, %3 = smask, %4 = in */
66 __asm__ volatile( | |
67 /* pass 1 */ | |
68 | |
/* each pass-1 butterfly pairs one input vector with another input
 * vector whose element order has been reversed by shufps $0x1b */
69 "movaps (%4), %%xmm0 \n\t" | |
70 "movaps 112(%4), %%xmm1 \n\t" | |
71 "shufps $0x1b, %%xmm1, %%xmm1 \n\t" | |
72 BUTTERFLY(xmm0, xmm1, (%2), xmm3) | |
73 | |
74 "movaps 64(%4), %%xmm7 \n\t" | |
75 "movaps 48(%4), %%xmm4 \n\t" | |
76 "shufps $0x1b, %%xmm4, %%xmm4 \n\t" | |
77 BUTTERFLY(xmm7, xmm4, 48(%2), xmm3) | |
78 | |
79 | |
80 /* pass 2 */ | |
81 "movaps 64(%2), %%xmm2 \n\t" | |
82 BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3) | |
/* park two results in out so xmm1 and xmm4 can be reused below */
83 "movaps %%xmm1, 48(%1) \n\t" | |
84 "movaps %%xmm4, (%1) \n\t" | |
85 | |
86 /* pass 1 */ | |
87 "movaps 16(%4), %%xmm1 \n\t" | |
88 "movaps 96(%4), %%xmm6 \n\t" | |
89 "shufps $0x1b, %%xmm6, %%xmm6 \n\t" | |
90 BUTTERFLY(xmm1, xmm6, 16(%2), xmm3) | |
91 | |
92 "movaps 80(%4), %%xmm4 \n\t" | |
93 "movaps 32(%4), %%xmm5 \n\t" | |
94 "shufps $0x1b, %%xmm5, %%xmm5 \n\t" | |
95 BUTTERFLY(xmm4, xmm5, 32(%2), xmm3) | |
96 | |
97 /* pass 2 */ | |
98 BUTTERFLY(xmm0, xmm7, %%xmm2, xmm3) | |
99 | |
100 "movaps 80(%2), %%xmm2 \n\t" | |
101 BUTTERFLY(xmm6, xmm5, %%xmm2, xmm3) | |
102 | |
103 BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3) | |
104 | |
105 /* pass 3 */ | |
106 "movaps 96(%2), %%xmm2 \n\t" | |
107 "shufps $0x1b, %%xmm1, %%xmm1 \n\t" | |
108 BUTTERFLY(xmm0, xmm1, %%xmm2, xmm3) | |
109 "movaps %%xmm0, 112(%1) \n\t" | |
110 "movaps %%xmm1, 96(%1) \n\t" | |
111 | |
/* reload the vectors parked in out during pass 2 */
112 "movaps 0(%1), %%xmm0 \n\t" | |
113 "shufps $0x1b, %%xmm5, %%xmm5 \n\t" | |
114 BUTTERFLY(xmm0, xmm5, %%xmm2, xmm3) | |
115 | |
116 "movaps 48(%1), %%xmm1 \n\t" | |
117 "shufps $0x1b, %%xmm6, %%xmm6 \n\t" | |
118 BUTTERFLY(xmm1, xmm6, %%xmm2, xmm3) | |
119 "movaps %%xmm1, 48(%1) \n\t" | |
120 | |
121 "shufps $0x1b, %%xmm4, %%xmm4 \n\t" | |
122 BUTTERFLY(xmm7, xmm4, %%xmm2, xmm3) | |
123 | |
124 /* pass 4 */ | |
/* xmm3 = sign mask (negates elements 2 and 3), xmm2 = multipliers.
 * With the reversing shuffle of BUTTERFLY2 each vector {v0,v1,v2,v3}
 * becomes {v3+v0, v2+v1, v1-v2, v0-v3} * xmm2; xmm1 is the temporary. */
125 "movaps (%3), %%xmm3 \n\t" | |
126 "movaps 112(%2), %%xmm2 \n\t" | |
127 | |
128 BUTTERFLY2(xmm5, xmm3, xmm2, xmm1) | |
129 | |
130 BUTTERFLY2(xmm0, xmm3, xmm2, xmm1) | |
131 "movaps %%xmm0, 16(%1) \n\t" | |
132 | |
133 BUTTERFLY2(xmm6, xmm3, xmm2, xmm1) | |
134 "movaps %%xmm6, 32(%1) \n\t" | |
135 | |
136 "movaps 48(%1), %%xmm0 \n\t" | |
137 BUTTERFLY2(xmm0, xmm3, xmm2, xmm1) | |
138 "movaps %%xmm0, 48(%1) \n\t" | |
139 | |
140 BUTTERFLY2(xmm4, xmm3, xmm2, xmm1) | |
141 | |
142 BUTTERFLY2(xmm7, xmm3, xmm2, xmm1) | |
143 | |
144 "movaps 96(%1), %%xmm6 \n\t" | |
145 BUTTERFLY2(xmm6, xmm3, xmm2, xmm1) | |
146 | |
147 "movaps 112(%1), %%xmm0 \n\t" | |
148 BUTTERFLY2(xmm0, xmm3, xmm2, xmm1) | |
149 | |
150 /* pass 5 */ | |
/* shufps $0xCC turns the mask into one that negates elements 1 and 3.
 * With the pair-swapping shuffle of BUTTERFLY3 each vector {v0,v1,v2,v3}
 * becomes {v0+v1, v0-v1, v2+v3, v2-v3} * xmm2. Every result is stored
 * to out, where pass 6 picks up the individual elements. */
151 "movaps 128(%2), %%xmm2 \n\t" | |
152 "shufps $0xCC, %%xmm3,%%xmm3 \n\t" | |
153 | |
154 BUTTERFLY3(xmm5, xmm3, xmm2, xmm1) | |
155 "movaps %%xmm5, (%1) \n\t" | |
156 | |
157 "movaps 16(%1), %%xmm1 \n\t" | |
158 BUTTERFLY3(xmm1, xmm3, xmm2, xmm5) | |
159 "movaps %%xmm1, 16(%1) \n\t" | |
160 | |
161 BUTTERFLY3(xmm4, xmm3, xmm2, xmm5) | |
162 "movaps %%xmm4, 64(%1) \n\t" | |
163 | |
164 BUTTERFLY3(xmm7, xmm3, xmm2, xmm5) | |
165 "movaps %%xmm7, 80(%1) \n\t" | |
166 | |
167 "movaps 32(%1), %%xmm5 \n\t" | |
168 BUTTERFLY3(xmm5, xmm3, xmm2, xmm7) | |
169 "movaps %%xmm5, 32(%1) \n\t" | |
170 | |
171 "movaps 48(%1), %%xmm4 \n\t" | |
172 BUTTERFLY3(xmm4, xmm3, xmm2, xmm7) | |
173 "movaps %%xmm4, 48(%1) \n\t" | |
174 | |
175 BUTTERFLY3(xmm6, xmm3, xmm2, xmm7) | |
176 "movaps %%xmm6, 96(%1) \n\t" | |
177 | |
178 BUTTERFLY3(xmm0, xmm3, xmm2, xmm7) | |
179 "movaps %%xmm0, 112(%1) \n\t" | |
180 | |
181 | |
182 /* pass 6, no SIMD... */ | |
/* Scalar stage operating in place on out: movss/addss work on single
 * floats, movl through %0 copies a 32-bit word unchanged. Loads and
 * stores to the same offsets are interleaved, so the instruction order
 * below must be preserved: several offsets are read before being
 * overwritten and others are re-read after an update. */
183 "movss 56(%1), %%xmm3 \n\t" | |
184 "movl 4(%1), %0 \n\t" | |
185 "addss 60(%1), %%xmm3 \n\t" | |
186 "movss 72(%1), %%xmm7 \n\t" | |
187 "addss %%xmm3, %%xmm4 \n\t" | |
188 "movss 52(%1), %%xmm2 \n\t" | |
189 "addss %%xmm3, %%xmm2 \n\t" | |
190 "movss 24(%1), %%xmm3 \n\t" | |
191 "addss 28(%1), %%xmm3 \n\t" | |
192 "addss 76(%1), %%xmm7 \n\t" | |
193 "addss %%xmm3, %%xmm1 \n\t" | |
194 "addss %%xmm4, %%xmm5 \n\t" | |
195 "movss %%xmm1, 16(%1) \n\t" | |
196 "movss 20(%1), %%xmm1 \n\t" | |
197 "addss %%xmm3, %%xmm1 \n\t" | |
198 "movss 40(%1), %%xmm3 \n\t" | |
199 "movss %%xmm1, 48(%1) \n\t" | |
200 "addss 44(%1), %%xmm3 \n\t" | |
201 "movss 20(%1), %%xmm1 \n\t" | |
202 "addss %%xmm3, %%xmm4 \n\t" | |
203 "addss %%xmm2, %%xmm3 \n\t" | |
204 "addss 28(%1), %%xmm1 \n\t" | |
205 "movss %%xmm3, 40(%1) \n\t" | |
206 "addss 36(%1), %%xmm2 \n\t" | |
207 "movss 8(%1), %%xmm3 \n\t" | |
208 "movss %%xmm2, 56(%1) \n\t" | |
209 "addss 12(%1), %%xmm3 \n\t" | |
210 "movss %%xmm5, 8(%1) \n\t" | |
211 "movss %%xmm3, 32(%1) \n\t" | |
212 "movss 52(%1), %%xmm2 \n\t" | |
213 "movss 80(%1), %%xmm3 \n\t" | |
214 "movss 120(%1), %%xmm5 \n\t" | |
215 "movss %%xmm1, 80(%1) \n\t" | |
216 "movss %%xmm4, 24(%1) \n\t" | |
217 "addss 124(%1), %%xmm5 \n\t" | |
218 "movss 64(%1), %%xmm1 \n\t" | |
219 "addss 60(%1), %%xmm2 \n\t" | |
220 "addss %%xmm5, %%xmm0 \n\t" | |
221 "addss 116(%1), %%xmm5 \n\t" | |
222 "movl %0, 64(%1) \n\t" | |
223 "addss %%xmm0, %%xmm6 \n\t" | |
224 "addss %%xmm6, %%xmm1 \n\t" | |
225 "movl 12(%1), %0 \n\t" | |
226 "movss %%xmm1, 4(%1) \n\t" | |
227 "movss 88(%1), %%xmm1 \n\t" | |
228 "movl %0, 96(%1) \n\t" | |
229 "addss 92(%1), %%xmm1 \n\t" | |
230 "movss 104(%1), %%xmm4 \n\t" | |
231 "movl 28(%1), %0 \n\t" | |
232 "addss 108(%1), %%xmm4 \n\t" | |
233 "addss %%xmm4, %%xmm0 \n\t" | |
234 "addss %%xmm1, %%xmm3 \n\t" | |
235 "addss 84(%1), %%xmm1 \n\t" | |
236 "addss %%xmm5, %%xmm4 \n\t" | |
237 "addss %%xmm3, %%xmm6 \n\t" | |
238 "addss %%xmm0, %%xmm3 \n\t" | |
239 "addss %%xmm7, %%xmm0 \n\t" | |
240 "addss 100(%1), %%xmm5 \n\t" | |
241 "addss %%xmm4, %%xmm7 \n\t" | |
242 "movl %0, 112(%1) \n\t" | |
243 "movss %%xmm0, 28(%1) \n\t" | |
244 "movss 36(%1), %%xmm0 \n\t" | |
245 "movss %%xmm7, 36(%1) \n\t" | |
246 "addss %%xmm1, %%xmm4 \n\t" | |
247 "movss 116(%1), %%xmm7 \n\t" | |
248 "addss %%xmm2, %%xmm0 \n\t" | |
249 "addss 124(%1), %%xmm7 \n\t" | |
250 "movss %%xmm0, 72(%1) \n\t" | |
251 "movss 44(%1), %%xmm0 \n\t" | |
252 "movss %%xmm6, 12(%1) \n\t" | |
253 "movss %%xmm3, 20(%1) \n\t" | |
254 "addss %%xmm0, %%xmm2 \n\t" | |
255 "movss %%xmm4, 44(%1) \n\t" | |
256 "movss %%xmm2, 88(%1) \n\t" | |
257 "addss 60(%1), %%xmm0 \n\t" | |
258 "movl 60(%1), %0 \n\t" | |
259 "movl %0, 120(%1) \n\t" | |
260 "movss %%xmm0, 104(%1) \n\t" | |
261 "addss %%xmm5, %%xmm1 \n\t" | |
262 "addss 68(%1), %%xmm5 \n\t" | |
263 "movss %%xmm1, 52(%1) \n\t" | |
264 "movss %%xmm5, 60(%1) \n\t" | |
265 "movss 68(%1), %%xmm1 \n\t" | |
266 "movss 100(%1), %%xmm5 \n\t" | |
267 "addss %%xmm7, %%xmm5 \n\t" | |
268 "addss 108(%1), %%xmm7 \n\t" | |
269 "addss %%xmm5, %%xmm1 \n\t" | |
270 "movss 84(%1), %%xmm2 \n\t" | |
271 "addss 92(%1), %%xmm2 \n\t" | |
272 "addss %%xmm2, %%xmm5 \n\t" | |
273 "movss %%xmm1, 68(%1) \n\t" | |
274 "addss %%xmm7, %%xmm2 \n\t" | |
275 "movss 76(%1), %%xmm1 \n\t" | |
276 "movss %%xmm2, 84(%1) \n\t" | |
277 "movss %%xmm5, 76(%1) \n\t" | |
278 "movss 108(%1), %%xmm2 \n\t" | |
279 "addss %%xmm1, %%xmm7 \n\t" | |
280 "addss 124(%1), %%xmm2 \n\t" | |
281 "addss %%xmm2, %%xmm1 \n\t" | |
282 "addss 92(%1), %%xmm2 \n\t" | |
283 "movss %%xmm1, 100(%1) \n\t" | |
284 "movss %%xmm2, 108(%1) \n\t" | |
285 "movss 92(%1), %%xmm2 \n\t" | |
286 "movss %%xmm7, 92(%1) \n\t" | |
287 "addss 124(%1), %%xmm2 \n\t" | |
288 "movss %%xmm2, 116(%1) \n\t" | |
/* output: tmp1 is read-write ("+") and early-clobber ("&"), so it gets
 * a register distinct from all input operands */
289 :"+&r"(tmp1) | |
/* inputs, in operand order %1..%4 */
290 :"r"(out), "r"(b1), "r"(smask), "r"(in) | |
/* the asm reads and writes memory through the pointer operands, which
 * the constraints alone do not tell the compiler */
291 :"memory" | |
292 ); | |
293 } | |
294 |