Mercurial > libavcodec.hg
comparison i386/fft_3dn.c @ 3175:c20c181e0eca libavcodec
3DNow! & Extended 3DNow! versions of FFT
Patch by Zuxy Meng, zuxy <<dot>> meng >>at<< gmail <<dot>> com
Minor non-functional diff-related fixes by me.
author | corey |
---|---|
date | Wed, 08 Mar 2006 04:13:55 +0000 |
parents | |
children | a3d97c60ea07 |
comparison
equal
deleted
inserted
replaced
3174:b65cbae9d940 | 3175:c20c181e0eca |
---|---|
1 /* | |
2 * FFT/MDCT transform with 3DNow! optimizations | |
3 * Copyright (c) 2006 Zuxy MENG Jie. | |
4 * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard. | |
5 * | |
6 * This library is free software; you can redistribute it and/or | |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2 of the License, or (at your option) any later version. | |
10 * | |
11 * This library is distributed in the hope that it will be useful, | |
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 * Lesser General Public License for more details. | |
15 * | |
16 * You should have received a copy of the GNU Lesser General Public | |
17 * License along with this library; if not, write to the Free Software | |
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 */ | |
20 #include "../dsputil.h" | |
21 #include <math.h> | |
22 | |
23 #ifdef HAVE_MM3DNOW | |
24 | |
25 #include <mm3dnow.h> | |
26 | |
/*
 * XOR mask with only the sign bit of the upper 32-bit element set.
 * ff_fft_calc_3dn() loads it as one packed 64-bit value when s->inverse
 * is zero.  The elements are unsigned and the constant is built with 1U
 * because shifting a signed 1 into bit 31 is undefined behaviour in C.
 */
static const unsigned int p1m1[2] __attribute__((aligned(8))) =
    { 0, 1U << 31 };
29 | |
/*
 * XOR mask with only the sign bit of the lower 32-bit element set.
 * ff_fft_calc_3dn() loads it as one packed 64-bit value when s->inverse
 * is non-zero.  The elements are unsigned and the constant is built with
 * 1U because shifting a signed 1 into bit 31 is undefined behaviour in C.
 */
static const unsigned int m1p1[2] __attribute__((aligned(8))) =
    { 1U << 31, 0 };
32 | |
33 void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z) | |
34 { | |
35 int ln = s->nbits; | |
36 int j, np, np2; | |
37 int nblocks, nloops; | |
38 register FFTComplex *p, *q; | |
39 FFTComplex *cptr, *cptr1; | |
40 int k; | |
41 | |
42 np = 1 << ln; | |
43 /* FEMMS not a must here but recommended by AMD */ | |
44 _m_femms(); | |
45 | |
46 { | |
47 __m64 *r, a0, a1, b0, b1, tmp, c; | |
48 | |
49 r = (__m64 *)&z[0]; | |
50 if (s->inverse) | |
51 c = *(__m64 *)m1p1; | |
52 else | |
53 c = *(__m64 *)p1m1; | |
54 | |
55 j = (np >> 2); | |
56 do { | |
57 /* do the pass 0 butterfly */ | |
58 a0 = _m_pfadd(r[0], r[1]); | |
59 a1 = _m_pfsub(r[0], r[1]); | |
60 | |
61 /* do the pass 0 butterfly */ | |
62 b0 = _m_pfadd(r[2], r[3]); | |
63 b1 = _m_pfsub(r[2], r[3]); | |
64 | |
65 /* multiply third by -i */ | |
66 tmp = _m_punpckhdq(b1, b1); | |
67 b1 = _m_punpckldq(b1, b1); | |
68 b1 = _m_punpckldq(tmp, b1); | |
69 b1 = _m_pxor(b1, c); | |
70 | |
71 /* do the pass 1 butterfly */ | |
72 r[0] = _m_pfadd(a0, b0); | |
73 r[1] = _m_pfadd(a1, b1); | |
74 r[2] = _m_pfsub(a0, b0); | |
75 r[3] = _m_pfsub(a1, b1); | |
76 r += 4; | |
77 } while (--j != 0); | |
78 } | |
79 /* pass 2 .. ln-1 */ | |
80 | |
81 nblocks = np >> 3; | |
82 nloops = 1 << 2; | |
83 np2 = np >> 1; | |
84 | |
85 cptr1 = s->exptab1; | |
86 do { | |
87 p = z; | |
88 q = z + nloops; | |
89 j = nblocks; | |
90 do { | |
91 cptr = cptr1; | |
92 k = nloops >> 1; | |
93 do { | |
94 __m64 a0, a1, b0, b1, c0, c1, t10, t11, t20, t21; | |
95 | |
96 a0 = *(__m64 *)&p[0]; | |
97 a1 = *(__m64 *)&p[1]; | |
98 b0 = *(__m64 *)&q[0]; | |
99 b1 = *(__m64 *)&q[1]; | |
100 | |
101 /* complex mul */ | |
102 c0 = *(__m64 *)&cptr[0]; | |
103 c1 = *(__m64 *)&cptr[1]; | |
104 /* cre*re cim*re */ | |
105 t10 = _m_pfmul(c0, _m_punpckldq(b0, b0)); | |
106 t11 = _m_pfmul(c1, _m_punpckldq(b1, b1)); | |
107 c0 = *(__m64 *)&cptr[2]; | |
108 c1 = *(__m64 *)&cptr[3]; | |
109 /* -cim*im cre*im */ | |
110 t20 = _m_pfmul(c0, _m_punpckhdq(b0, b0)); | |
111 t21 = _m_pfmul(c1, _m_punpckhdq(b1, b1)); | |
112 b0 = _m_pfadd(t10, t20); | |
113 b1 = _m_pfadd(t11, t21); | |
114 | |
115 /* butterfly */ | |
116 *(__m64 *)&p[0] = _m_pfadd(a0, b0); | |
117 *(__m64 *)&p[1] = _m_pfadd(a1, b1); | |
118 *(__m64 *)&q[0] = _m_pfsub(a0, b0); | |
119 *(__m64 *)&q[1] = _m_pfsub(a1, b1); | |
120 | |
121 p += 2; | |
122 q += 2; | |
123 cptr += 4; | |
124 } while (--k); | |
125 | |
126 p += nloops; | |
127 q += nloops; | |
128 } while (--j); | |
129 cptr1 += nloops * 2; | |
130 nblocks = nblocks >> 1; | |
131 nloops = nloops << 1; | |
132 } while (nblocks != 0); | |
133 _m_femms(); | |
134 } | |
135 | |
136 #endif |