Mercurial > libavcodec.hg
annotate i386/fft_3dn2.c @ 3565:f086f8868bb6 libavcodec
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
Patch by John Dalgliesh % johnd AH defyne P org %
Original thread:
Date: Aug 8, 2006 8:12 PM
Subject: Re: [Ffmpeg-devel] [PATCH] '/nop' illegal for old versions of GAS
author | gpoirier |
---|---|
date | Thu, 10 Aug 2006 15:26:18 +0000 |
parents | 97325fecd35a |
children | a3d97c60ea07 |
rev | line source |
---|---|
3175 | 1 /* |
2 * FFT/MDCT transform with Extended 3DNow! optimizations | |
3555 | 3 * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt |
3175 | 4 * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard. |
5 * | |
6 * This library is free software; you can redistribute it and/or | |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2 of the License, or (at your option) any later version. | |
10 * | |
11 * This library is distributed in the hope that it will be useful, | |
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 * Lesser General Public License for more details. | |
15 * | |
16 * You should have received a copy of the GNU Lesser General Public | |
17 * License along with this library; if not, write to the Free Software | |
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 */ | |
20 #include "../dsputil.h" | |
21 #include <math.h> | |
22 | |
23 #ifdef HAVE_MM3DNOW | |
24 | |
25 #include <mm3dnow.h> | |
26 | |
/*
 * Sign-bit masks for a pair of packed 32-bit values, 8-byte aligned so they
 * can be loaded as a single 64-bit MMX operand.
 *
 * Bit 31 is set in exactly one of the two elements; XORing a packed float
 * pair with one of these masks flips the sign of that element only.
 * (The names presumably read as "+1, -1" and "-1, +1".)
 *
 * The masks are built from an unsigned constant: shifting a signed 1 into
 * bit 31 is undefined behaviour in C.
 */
static const unsigned int p1m1[2] __attribute__((aligned(8))) =
    { 0, 1U << 31 };

static const unsigned int m1p1[2] __attribute__((aligned(8))) =
    { 1U << 31, 0 };
32 | |
33 void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z) | |
34 { | |
35 int ln = s->nbits; | |
36 int j, np, np2; | |
37 int nblocks, nloops; | |
38 register FFTComplex *p, *q; | |
39 FFTComplex *cptr, *cptr1; | |
40 int k; | |
41 | |
42 np = 1 << ln; | |
43 /* FEMMS is not a must here but recommended by AMD */ | |
44 _m_femms(); | |
45 | |
46 { | |
47 __m64 *r, a0, a1, b0, b1, c; | |
48 | |
49 r = (__m64 *)&z[0]; | |
50 if (s->inverse) | |
51 c = *(__m64 *)m1p1; | |
52 else | |
53 c = *(__m64 *)p1m1; | |
54 | |
55 j = (np >> 2); | |
56 do { | |
57 /* do the pass 0 butterfly */ | |
58 a0 = _m_pfadd(r[0], r[1]); | |
59 a1 = _m_pfsub(r[0], r[1]); | |
60 | |
61 /* do the pass 0 butterfly */ | |
62 b0 = _m_pfadd(r[2], r[3]); | |
63 b1 = _m_pfsub(r[2], r[3]); | |
64 | |
65 /* multiply third by -i */ | |
66 b1 = _m_pswapd(b1); | |
67 b1 = _m_pxor(b1, c); | |
68 | |
69 r[0] = _m_pfadd(a0, b0); | |
70 r[1] = _m_pfadd(a1, b1); | |
71 r[2] = _m_pfsub(a0, b0); | |
72 r[3] = _m_pfsub(a1, b1); | |
73 r += 4; | |
74 } while (--j != 0); | |
75 } | |
76 /* pass 2 .. ln-1 */ | |
77 | |
78 nblocks = np >> 3; | |
79 nloops = 1 << 2; | |
80 np2 = np >> 1; | |
81 | |
82 cptr1 = s->exptab1; | |
83 do { | |
84 p = z; | |
85 q = z + nloops; | |
86 j = nblocks; | |
87 do { | |
88 cptr = cptr1; | |
89 k = nloops >> 1; | |
90 do { | |
91 __m64 a0, a1, b0, b1, c0, c1, t10, t11, t20, t21; | |
92 | |
93 a0 = *(__m64 *)&p[0]; | |
94 a1 = *(__m64 *)&p[1]; | |
95 b0 = *(__m64 *)&q[0]; | |
96 b1 = *(__m64 *)&q[1]; | |
97 | |
98 /* complex mul */ | |
99 c0 = *(__m64 *)&cptr[0]; | |
100 c1 = *(__m64 *)&cptr[1]; | |
101 /* cre*re cim*im */ | |
102 t10 = _m_pfmul(c0, b0); | |
103 t11 = _m_pfmul(c1, b1); | |
104 /* no need to access cptr[2] & cptr[3] */ | |
105 c0 = _m_pswapd(c0); | |
106 c1 = _m_pswapd(c1); | |
107 /* cim*re cre*im */ | |
108 t20 = _m_pfmul(c0, b0); | |
109 t21 = _m_pfmul(c1, b1); | |
110 | |
111 /* cre*re-cim*im cim*re+cre*im */ | |
112 b0 = _m_pfpnacc(t10, t20); | |
113 b1 = _m_pfpnacc(t11, t21); | |
114 | |
115 /* butterfly */ | |
116 *(__m64 *)&p[0] = _m_pfadd(a0, b0); | |
117 *(__m64 *)&p[1] = _m_pfadd(a1, b1); | |
118 *(__m64 *)&q[0] = _m_pfsub(a0, b0); | |
119 *(__m64 *)&q[1] = _m_pfsub(a1, b1); | |
120 | |
121 p += 2; | |
122 q += 2; | |
123 cptr += 4; | |
124 } while (--k); | |
125 | |
126 p += nloops; | |
127 q += nloops; | |
128 } while (--j); | |
129 cptr1 += nloops * 2; | |
130 nblocks = nblocks >> 1; | |
131 nloops = nloops << 1; | |
132 } while (nblocks != 0); | |
133 _m_femms(); | |
134 } | |
135 | |
136 #endif | |
3555 | 137 |
138 void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, | |
139 const FFTSample *input, FFTSample *tmp) | |
140 { | |
3559
c02459cd0d31
slightly faster ff_imdct_calc_3dn2() on amd64. (gcc added a bunch of useless movsxd)
lorenm
parents:
3555
diff
changeset
|
141 long k, n8, n4, n2, n; |
3555 | 142 const uint16_t *revtab = s->fft.revtab; |
143 const FFTSample *tcos = s->tcos; | |
144 const FFTSample *tsin = s->tsin; | |
145 const FFTSample *in1, *in2; | |
146 FFTComplex *z = (FFTComplex *)tmp; | |
147 | |
148 n = 1 << s->nbits; | |
149 n2 = n >> 1; | |
150 n4 = n >> 2; | |
151 n8 = n >> 3; | |
152 | |
153 /* pre rotation */ | |
154 in1 = input; | |
155 in2 = input + n2 - 1; | |
156 for(k = 0; k < n4; k++) { | |
3560
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
157 // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it |
3555 | 158 asm volatile( |
3560
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
159 "movd %0, %%mm0 \n\t" |
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
160 "movd %2, %%mm1 \n\t" |
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
161 "punpckldq %1, %%mm0 \n\t" |
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
162 "punpckldq %3, %%mm1 \n\t" |
3555 | 163 "movq %%mm0, %%mm2 \n\t" |
164 "pfmul %%mm1, %%mm0 \n\t" | |
165 "pswapd %%mm1, %%mm1 \n\t" | |
166 "pfmul %%mm1, %%mm2 \n\t" | |
167 "pfpnacc %%mm2, %%mm0 \n\t" | |
3560
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
168 ::"m"(in2[-2*k]), "m"(in1[2*k]), |
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
169 "m"(tcos[k]), "m"(tsin[k]) |
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
170 ); |
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
171 asm volatile( |
3555 | 172 "movq %%mm0, %0 \n\t" |
173 :"=m"(z[revtab[k]]) | |
174 ); | |
175 } | |
176 | |
177 ff_fft_calc(&s->fft, z); | |
178 | |
179 /* post rotation + reordering */ | |
180 for(k = 0; k < n4; k++) { | |
181 asm volatile( | |
182 "movq %0, %%mm0 \n\t" | |
183 "movd %1, %%mm1 \n\t" | |
184 "punpckldq %2, %%mm1 \n\t" | |
185 "movq %%mm0, %%mm2 \n\t" | |
186 "pfmul %%mm1, %%mm0 \n\t" | |
187 "pswapd %%mm1, %%mm1 \n\t" | |
188 "pfmul %%mm1, %%mm2 \n\t" | |
189 "pfpnacc %%mm2, %%mm0 \n\t" | |
190 "movq %%mm0, %0 \n\t" | |
191 :"+m"(z[k]) | |
192 :"m"(tcos[k]), "m"(tsin[k]) | |
193 ); | |
194 } | |
195 | |
3560
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
196 z += n8; |
3555 | 197 asm volatile("movd %0, %%mm7" ::"r"(1<<31)); |
198 for(k = 0; k < n8; k++) { | |
199 asm volatile( | |
3560
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
200 "movq %0, %%mm0 \n\t" |
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
201 "pswapd %1, %%mm1 \n\t" |
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
202 ::"m"(z[k]), "m"(z[-1-k]) |
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
203 ); |
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
204 asm volatile( |
3555 | 205 "movq %%mm0, %%mm2 \n\t" |
206 "pxor %%mm7, %%mm2 \n\t" | |
207 "punpckldq %%mm1, %%mm2 \n\t" | |
208 "pswapd %%mm2, %%mm3 \n\t" | |
209 "punpckhdq %%mm1, %%mm0 \n\t" | |
210 "pswapd %%mm0, %%mm4 \n\t" | |
211 "pxor %%mm7, %%mm0 \n\t" | |
212 "pxor %%mm7, %%mm4 \n\t" | |
213 "movq %%mm0, %0 \n\t" // { -z[n8+k].im, z[n8-1-k].re } | |
214 "movq %%mm4, %1 \n\t" // { -z[n8-1-k].re, z[n8+k].im } | |
215 "movq %%mm2, %2 \n\t" // { -z[n8+k].re, z[n8-1-k].im } | |
216 "movq %%mm3, %3 \n\t" // { z[n8-1-k].im, -z[n8+k].re } | |
217 :"=m"(output[2*k]), "=m"(output[n2-2-2*k]), | |
218 "=m"(output[n2+2*k]), "=m"(output[n-2-2*k]) | |
3560
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
219 ::"memory" |
3555 | 220 ); |
221 } | |
3561 | 222 asm volatile("femms"); |
3555 | 223 } |