Mercurial > libavcodec.hg
annotate i386/fft_sse.c @ 3315:cfd452a6560b libavcodec
h264: faster fill_rectangle()
author | lorenm |
---|---|
date | Sun, 28 May 2006 22:28:08 +0000 |
parents | ab1273ffe275 |
children | a3d97c60ea07 |
rev | line source |
---|---|
781 | 1 /* |
2 * FFT/MDCT transform with SSE optimizations | |
3 * Copyright (c) 2002 Fabrice Bellard. | |
4 * | |
5 * This library is free software; you can redistribute it and/or | |
6 * modify it under the terms of the GNU Lesser General Public | |
7 * License as published by the Free Software Foundation; either | |
8 * version 2 of the License, or (at your option) any later version. | |
9 * | |
10 * This library is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 * Lesser General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU Lesser General Public | |
16 * License along with this library; if not, write to the Free Software | |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
781 | 18 */ |
19 #include "../dsputil.h" | |
20 #include <math.h> | |
21 | |
968
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
22 #ifdef HAVE_BUILTIN_VECTOR |
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
23 |
781 | 24 #include <xmmintrin.h> |
25 | |
3166
ab1273ffe275
use xorps instead of mulps to toggle the sign of a float, as suggested by Software Optimization Guide for AMD64 Processors.
gpoirier
parents:
3036
diff
changeset
|
/*
 * XOR sign mask "+1 +1 +1 -1": only the sign bit of element 3 is set, so
 * _mm_xor_ps() with this constant negates the highest float of a vector and
 * leaves the other three untouched.
 * The mask is unsigned and built with 1U << 31 because shifting a signed 1
 * into the sign bit of a 32-bit int is undefined behaviour.
 */
static const unsigned int p1p1p1m1[4] __attribute__((aligned(16))) =
    { 0, 0, 0, 1U << 31 };
781 | 28 |
3166
ab1273ffe275
use xorps instead of mulps to toggle the sign of a float, as suggested by Software Optimization Guide for AMD64 Processors.
gpoirier
parents:
3036
diff
changeset
|
/*
 * XOR sign mask "+1 +1 -1 +1": only the sign bit of element 2 is set, so
 * _mm_xor_ps() with this constant negates the third float of a vector and
 * leaves the other three untouched.
 * The mask is unsigned and built with 1U << 31 because shifting a signed 1
 * into the sign bit of a 32-bit int is undefined behaviour.
 */
static const unsigned int p1p1m1p1[4] __attribute__((aligned(16))) =
    { 0, 0, 1U << 31, 0 };
968
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
31 |
3166
ab1273ffe275
use xorps instead of mulps to toggle the sign of a float, as suggested by Software Optimization Guide for AMD64 Processors.
gpoirier
parents:
3036
diff
changeset
|
/*
 * XOR sign mask "+1 +1 -1 -1": the sign bits of elements 2 and 3 are set, so
 * _mm_xor_ps() with this constant negates the upper two floats of a vector
 * and leaves the lower two untouched.
 * The mask is unsigned and built with 1U << 31 because shifting a signed 1
 * into the sign bit of a 32-bit int is undefined behaviour.
 */
static const unsigned int p1p1m1m1[4] __attribute__((aligned(16))) =
    { 0, 0, 1U << 31, 1U << 31 };
781 | 34 |
#if 0
/* Debug helper (compiled out): print the four floats of an SSE vector,
 * lowest element first, prefixed by the label in str. */
static void print_v4sf(const char *str, __m128 a)
{
    float lanes[4];

    /* copy the vector out so each element can be passed to printf */
    _mm_storeu_ps(lanes, a);
    printf("%s: %f %f %f %f\n",
           str, lanes[0], lanes[1], lanes[2], lanes[3]);
}
#endif
43 | |
44 /* XXX: handle reverse case */ | |
1879
dd63cb7e5080
fft_*() renamed into ff_fft_*() patch by (Gildas Bazin <gbazin at altern dot org>)
michael
parents:
968
diff
changeset
|
45 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z) |
781 | 46 { |
47 int ln = s->nbits; | |
2979 | 48 int j, np, np2; |
49 int nblocks, nloops; | |
781 | 50 register FFTComplex *p, *q; |
51 FFTComplex *cptr, *cptr1; | |
52 int k; | |
53 | |
54 np = 1 << ln; | |
55 | |
56 { | |
57 __m128 *r, a, b, a1, c1, c2; | |
58 | |
59 r = (__m128 *)&z[0]; | |
60 c1 = *(__m128 *)p1p1m1m1; | |
968
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
61 if (s->inverse) |
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
62 c2 = *(__m128 *)p1p1m1p1; |
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
63 else |
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
64 c2 = *(__m128 *)p1p1p1m1; |
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
65 |
781 | 66 j = (np >> 2); |
67 do { | |
68 a = r[0]; | |
69 b = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); | |
3166
ab1273ffe275
use xorps instead of mulps to toggle the sign of a float, as suggested by Software Optimization Guide for AMD64 Processors.
gpoirier
parents:
3036
diff
changeset
|
70 a = _mm_xor_ps(a, c1); |
781 | 71 /* do the pass 0 butterfly */ |
72 a = _mm_add_ps(a, b); | |
73 | |
74 a1 = r[1]; | |
75 b = _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 0, 3, 2)); | |
3166
ab1273ffe275
use xorps instead of mulps to toggle the sign of a float, as suggested by Software Optimization Guide for AMD64 Processors.
gpoirier
parents:
3036
diff
changeset
|
76 a1 = _mm_xor_ps(a1, c1); |
781 | 77 /* do the pass 0 butterfly */ |
78 b = _mm_add_ps(a1, b); | |
79 | |
80 /* multiply third by -i */ | |
3166
ab1273ffe275
use xorps instead of mulps to toggle the sign of a float, as suggested by Software Optimization Guide for AMD64 Processors.
gpoirier
parents:
3036
diff
changeset
|
81 /* by toggling the sign bit */ |
781 | 82 b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 1, 0)); |
3166
ab1273ffe275
use xorps instead of mulps to toggle the sign of a float, as suggested by Software Optimization Guide for AMD64 Processors.
gpoirier
parents:
3036
diff
changeset
|
83 b = _mm_xor_ps(b, c2); |
781 | 84 |
85 /* do the pass 1 butterfly */ | |
86 r[0] = _mm_add_ps(a, b); | |
87 r[1] = _mm_sub_ps(a, b); | |
88 r += 2; | |
89 } while (--j != 0); | |
90 } | |
91 /* pass 2 .. ln-1 */ | |
92 | |
93 nblocks = np >> 3; | |
94 nloops = 1 << 2; | |
95 np2 = np >> 1; | |
96 | |
97 cptr1 = s->exptab1; | |
98 do { | |
99 p = z; | |
100 q = z + nloops; | |
101 j = nblocks; | |
102 do { | |
103 cptr = cptr1; | |
104 k = nloops >> 1; | |
105 do { | |
106 __m128 a, b, c, t1, t2; | |
107 | |
108 a = *(__m128 *)p; | |
109 b = *(__m128 *)q; | |
2967 | 110 |
781 | 111 /* complex mul */ |
112 c = *(__m128 *)cptr; | |
113 /* cre*re cim*re */ | |
2967 | 114 t1 = _mm_mul_ps(c, |
115 _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 0, 0))); | |
781 | 116 c = *(__m128 *)(cptr + 2); |
117 /* -cim*im cre*im */ | |
118 t2 = _mm_mul_ps(c, | |
2967 | 119 _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 1, 1))); |
781 | 120 b = _mm_add_ps(t1, t2); |
2967 | 121 |
781 | 122 /* butterfly */ |
123 *(__m128 *)p = _mm_add_ps(a, b); | |
124 *(__m128 *)q = _mm_sub_ps(a, b); | |
2967 | 125 |
781 | 126 p += 2; |
127 q += 2; | |
128 cptr += 4; | |
129 } while (--k); | |
2967 | 130 |
781 | 131 p += nloops; |
132 q += nloops; | |
133 } while (--j); | |
134 cptr1 += nloops * 2; | |
135 nblocks = nblocks >> 1; | |
136 nloops = nloops << 1; | |
137 } while (nblocks != 0); | |
138 } | |
968
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
139 |
64f1a11b5f86
added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents:
781
diff
changeset
|
140 #endif |