Mercurial > libavcodec.hg
annotate i386/fft_sse.c @ 1527:8ffd0c00e6df libavcodec
mmx2 optimization of huffyuv median encoding
author | michael |
---|---|
date | Mon, 13 Oct 2003 17:27:30 +0000 |
parents | 64f1a11b5f86 |
children | dd63cb7e5080 |
rev | line source |
---|---|
781 | 1 /* |
2 * FFT/MDCT transform with SSE optimizations | |
3 * Copyright (c) 2002 Fabrice Bellard. | |
4 * | |
5 * This library is free software; you can redistribute it and/or | |
6 * modify it under the terms of the GNU Lesser General Public | |
7 * License as published by the Free Software Foundation; either | |
8 * version 2 of the License, or (at your option) any later version. | |
9 * | |
10 * This library is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 * Lesser General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU Lesser General Public | |
16 * License along with this library; if not, write to the Free Software | |
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
18 */ | |
19 #include "../dsputil.h" | |
20 #include <math.h> | |
21 | |
968
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
22 #ifdef HAVE_BUILTIN_VECTOR |
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
23 |
781 | 24 #include <xmmintrin.h> |
25 | |
26 static const float p1p1p1m1[4] __attribute__((aligned(16))) = | |
27 { 1.0, 1.0, 1.0, -1.0 }; | |
28 | |
968
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
29 static const float p1p1m1p1[4] __attribute__((aligned(16))) = |
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
30 { 1.0, 1.0, -1.0, 1.0 }; |
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
31 |
781 | 32 static const float p1p1m1m1[4] __attribute__((aligned(16))) = |
33 { 1.0, 1.0, -1.0, -1.0 }; | |
34 | |
35 #if 0 | |
36 static void print_v4sf(const char *str, __m128 a) | |
37 { | |
38 float *p = (float *)&a; | |
39 printf("%s: %f %f %f %f\n", | |
40 str, p[0], p[1], p[2], p[3]); | |
41 } | |
42 #endif | |
43 | |
44 /* XXX: handle reverse case */ | |
45 void fft_calc_sse(FFTContext *s, FFTComplex *z) | |
46 { | |
47 int ln = s->nbits; | |
48 int j, np, np2; | |
49 int nblocks, nloops; | |
50 register FFTComplex *p, *q; | |
51 FFTComplex *cptr, *cptr1; | |
52 int k; | |
53 | |
54 np = 1 << ln; | |
55 | |
56 { | |
57 __m128 *r, a, b, a1, c1, c2; | |
58 | |
59 r = (__m128 *)&z[0]; | |
60 c1 = *(__m128 *)p1p1m1m1; | |
61 c2 = *(__m128 *)p1p1p1m1; | |
968
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
62 if (s->inverse) |
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
63 c2 = *(__m128 *)p1p1m1p1; |
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
64 else |
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
65 c2 = *(__m128 *)p1p1p1m1; |
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
66 |
781 | 67 j = (np >> 2); |
68 do { | |
69 a = r[0]; | |
70 b = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); | |
71 a = _mm_mul_ps(a, c1); | |
72 /* do the pass 0 butterfly */ | |
73 a = _mm_add_ps(a, b); | |
74 | |
75 a1 = r[1]; | |
76 b = _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 0, 3, 2)); | |
77 a1 = _mm_mul_ps(a1, c1); | |
78 /* do the pass 0 butterfly */ | |
79 b = _mm_add_ps(a1, b); | |
80 | |
81 /* multiply third by -i */ | |
82 b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 1, 0)); | |
83 b = _mm_mul_ps(b, c2); | |
84 | |
85 /* do the pass 1 butterfly */ | |
86 r[0] = _mm_add_ps(a, b); | |
87 r[1] = _mm_sub_ps(a, b); | |
88 r += 2; | |
89 } while (--j != 0); | |
90 } | |
91 /* pass 2 .. ln-1 */ | |
92 | |
93 nblocks = np >> 3; | |
94 nloops = 1 << 2; | |
95 np2 = np >> 1; | |
96 | |
97 cptr1 = s->exptab1; | |
98 do { | |
99 p = z; | |
100 q = z + nloops; | |
101 j = nblocks; | |
102 do { | |
103 cptr = cptr1; | |
104 k = nloops >> 1; | |
105 do { | |
106 __m128 a, b, c, t1, t2; | |
107 | |
108 a = *(__m128 *)p; | |
109 b = *(__m128 *)q; | |
110 | |
111 /* complex mul */ | |
112 c = *(__m128 *)cptr; | |
113 /* cre*re cim*re */ | |
114 t1 = _mm_mul_ps(c, | |
115 _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 0, 0))); | |
116 c = *(__m128 *)(cptr + 2); | |
117 /* -cim*im cre*im */ | |
118 t2 = _mm_mul_ps(c, | |
119 _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 1, 1))); | |
120 b = _mm_add_ps(t1, t2); | |
121 | |
122 /* butterfly */ | |
123 *(__m128 *)p = _mm_add_ps(a, b); | |
124 *(__m128 *)q = _mm_sub_ps(a, b); | |
125 | |
126 p += 2; | |
127 q += 2; | |
128 cptr += 4; | |
129 } while (--k); | |
130 | |
131 p += nloops; | |
132 q += nloops; | |
133 } while (--j); | |
134 cptr1 += nloops * 2; | |
135 nblocks = nblocks >> 1; | |
136 nloops = nloops << 1; | |
137 } while (nblocks != 0); | |
138 } | |
968
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
139 |
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
140 #endif |