Mercurial > libavcodec.hg
comparison i386/fft_sse.c @ 781:6f5e87957bcb libavcodec
new generic FFT/MDCT code for audio codecs
author | bellard |
---|---|
date | Mon, 28 Oct 2002 00:34:08 +0000 |
parents | |
children | 64f1a11b5f86 |
comparison
equal
deleted
inserted
replaced
780:a48bb8bc63dd | 781:6f5e87957bcb |
---|---|
1 /* | |
2 * FFT/MDCT transform with SSE optimizations | |
3 * Copyright (c) 2002 Fabrice Bellard. | |
4 * | |
5 * This library is free software; you can redistribute it and/or | |
6 * modify it under the terms of the GNU Lesser General Public | |
7 * License as published by the Free Software Foundation; either | |
8 * version 2 of the License, or (at your option) any later version. | |
9 * | |
10 * This library is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 * Lesser General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU Lesser General Public | |
16 * License along with this library; if not, write to the Free Software | |
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
18 */ | |
19 #include "../dsputil.h" | |
20 #include <math.h> | |
21 | |
22 #include <xmmintrin.h> | |
23 | |
24 static const float p1p1p1m1[4] __attribute__((aligned(16))) = | |
25 { 1.0, 1.0, 1.0, -1.0 }; | |
26 | |
27 static const float p1p1m1m1[4] __attribute__((aligned(16))) = | |
28 { 1.0, 1.0, -1.0, -1.0 }; | |
29 | |
30 #if 0 | |
31 static void print_v4sf(const char *str, __m128 a) | |
32 { | |
33 float *p = (float *)&a; | |
34 printf("%s: %f %f %f %f\n", | |
35 str, p[0], p[1], p[2], p[3]); | |
36 } | |
37 #endif | |
38 | |
39 /* XXX: handle reverse case */ | |
40 void fft_calc_sse(FFTContext *s, FFTComplex *z) | |
41 { | |
42 int ln = s->nbits; | |
43 int j, np, np2; | |
44 int nblocks, nloops; | |
45 register FFTComplex *p, *q; | |
46 FFTComplex *cptr, *cptr1; | |
47 int k; | |
48 | |
49 np = 1 << ln; | |
50 | |
51 { | |
52 __m128 *r, a, b, a1, c1, c2; | |
53 | |
54 r = (__m128 *)&z[0]; | |
55 c1 = *(__m128 *)p1p1m1m1; | |
56 c2 = *(__m128 *)p1p1p1m1; | |
57 j = (np >> 2); | |
58 do { | |
59 a = r[0]; | |
60 b = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); | |
61 a = _mm_mul_ps(a, c1); | |
62 /* do the pass 0 butterfly */ | |
63 a = _mm_add_ps(a, b); | |
64 | |
65 a1 = r[1]; | |
66 b = _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 0, 3, 2)); | |
67 a1 = _mm_mul_ps(a1, c1); | |
68 /* do the pass 0 butterfly */ | |
69 b = _mm_add_ps(a1, b); | |
70 | |
71 /* multiply third by -i */ | |
72 b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 1, 0)); | |
73 b = _mm_mul_ps(b, c2); | |
74 | |
75 /* do the pass 1 butterfly */ | |
76 r[0] = _mm_add_ps(a, b); | |
77 r[1] = _mm_sub_ps(a, b); | |
78 r += 2; | |
79 } while (--j != 0); | |
80 } | |
81 /* pass 2 .. ln-1 */ | |
82 | |
83 nblocks = np >> 3; | |
84 nloops = 1 << 2; | |
85 np2 = np >> 1; | |
86 | |
87 cptr1 = s->exptab1; | |
88 do { | |
89 p = z; | |
90 q = z + nloops; | |
91 j = nblocks; | |
92 do { | |
93 cptr = cptr1; | |
94 k = nloops >> 1; | |
95 do { | |
96 __m128 a, b, c, t1, t2; | |
97 | |
98 a = *(__m128 *)p; | |
99 b = *(__m128 *)q; | |
100 | |
101 /* complex mul */ | |
102 c = *(__m128 *)cptr; | |
103 /* cre*re cim*re */ | |
104 t1 = _mm_mul_ps(c, | |
105 _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 0, 0))); | |
106 c = *(__m128 *)(cptr + 2); | |
107 /* -cim*im cre*im */ | |
108 t2 = _mm_mul_ps(c, | |
109 _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 1, 1))); | |
110 b = _mm_add_ps(t1, t2); | |
111 | |
112 /* butterfly */ | |
113 *(__m128 *)p = _mm_add_ps(a, b); | |
114 *(__m128 *)q = _mm_sub_ps(a, b); | |
115 | |
116 p += 2; | |
117 q += 2; | |
118 cptr += 4; | |
119 } while (--k); | |
120 | |
121 p += nloops; | |
122 q += nloops; | |
123 } while (--j); | |
124 cptr1 += nloops * 2; | |
125 nblocks = nblocks >> 1; | |
126 nloops = nloops << 1; | |
127 } while (nblocks != 0); | |
128 } |