Mercurial > libavcodec.hg
comparison i386/fft_3dn.c @ 3590:a3d97c60ea07 libavcodec
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
2.5% faster fft, 0.5% faster vorbis.
author | lorenm |
---|---|
date | Fri, 18 Aug 2006 23:53:49 +0000 |
parents | c20c181e0eca |
children | c8c591fe26f8 |
comparison legend: equal | deleted | inserted | replaced
3589:19c437d6aba5 | 3590:a3d97c60ea07 |
---|---|
1 /* | 1 /* |
2 * FFT/MDCT transform with 3DNow! optimizations | 2 * FFT/MDCT transform with 3DNow! optimizations |
3 * Copyright (c) 2006 Zuxy MENG Jie. | 3 * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt |
4 * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard. | 4 * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard. |
5 * | 5 * |
6 * This library is free software; you can redistribute it and/or | 6 * This library is free software; you can redistribute it and/or |
7 * modify it under the terms of the GNU Lesser General Public | 7 * modify it under the terms of the GNU Lesser General Public |
8 * License as published by the Free Software Foundation; either | 8 * License as published by the Free Software Foundation; either |
16 * You should have received a copy of the GNU Lesser General Public | 16 * You should have received a copy of the GNU Lesser General Public |
17 * License along with this library; if not, write to the Free Software | 17 * License along with this library; if not, write to the Free Software |
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
19 */ | 19 */ |
20 #include "../dsputil.h" | 20 #include "../dsputil.h" |
21 #include <math.h> | |
22 | |
23 #ifdef HAVE_MM3DNOW | |
24 | |
25 #include <mm3dnow.h> | |
26 | 21 |
/* 64-bit sign-flip masks for pxor on an MMX register holding {re, im} floats:
 * 0x80000000 is the IEEE-754 single-precision sign bit.  p1m1 negates the
 * high (imaginary) element, m1p1 the low (real) one; the FFT picks one or
 * the other to select the forward vs. inverse transform.  8-byte alignment
 * lets them be loaded with a single movq.
 *
 * Fix: the previous `1 << 31` left-shifts into the sign bit of a signed int,
 * which is undefined behavior in C; use an unsigned element type with
 * `1U << 31` instead.  The in-memory bit pattern is unchanged and the arrays
 * are only ever read through an "m" inline-asm operand, so the generated
 * code is identical. */
static const unsigned int p1m1[2] __attribute__((aligned(8))) =
    { 0, 1U << 31 };

static const unsigned int m1p1[2] __attribute__((aligned(8))) =
    { 1U << 31, 0 };
/**
 * In-place complex FFT of 2^(s->nbits) points using AMD 3DNow! instructions.
 *
 * @param s  FFT context: nbits (log2 of transform length), inverse flag,
 *           and exptab1 (interleaved twiddle-factor table).
 * @param z  input/output array of 2^nbits FFTComplex (8 bytes each);
 *           transformed in place.
 *
 * Structure: passes 0 and 1 are fused into one radix-4 loop over the whole
 * buffer; the remaining passes 2..ln-1 are done by the generic
 * butterfly-with-twiddle loop below.
 *
 * NOTE(review): neither asm block lists a "memory" clobber or the mm
 * registers as clobbered; correctness relies on the compiler not caching
 * z[] across the asm and not using MMX itself — verify on modern compilers.
 */
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z)
{
    int ln = s->nbits;
    long i, j;                 /* 'long' so they match pointer width in asm addressing */
    long nblocks, nloops;
    FFTComplex *p, *cptr;

    asm volatile(
        /* FEMMS is not a must here but recommended by AMD */
        "femms \n\t"
        /* mm7 = sign mask used by the pass-0/1 loop: m1p1 flips the real
         * part (inverse transform), p1m1 the imaginary part (forward). */
        "movq %0, %%mm7 \n\t"
        ::"m"(*(s->inverse ? m1p1 : p1m1))
    );

    /* Fused passes 0 and 1: one radix-4 butterfly per group of 4 complexes,
     * walking the buffer backwards; i is the byte offset, counted down to 0
     * (the 'sub' sets the flags tested by 'jg'). */
    i = 8 << ln;               /* total buffer size in bytes: 2^ln * sizeof(FFTComplex) */
    asm volatile(
        "1: \n\t"
        "sub $32, %0 \n\t"               /* step back one group of 4 complexes */
        "movq    (%0,%1), %%mm0 \n\t"    /* z[0] */
        "movq  16(%0,%1), %%mm1 \n\t"    /* z[2] */
        "movq   8(%0,%1), %%mm2 \n\t"    /* z[1] */
        "movq  24(%0,%1), %%mm3 \n\t"    /* z[3] */
        "movq      %%mm0, %%mm4 \n\t"
        "movq      %%mm1, %%mm5 \n\t"
        /* pass-0 butterflies: mm0 = z0+z1, mm1 = z2+z3,
         *                     mm4 = z0-z1, mm5 = z2-z3 */
        "pfadd     %%mm2, %%mm0 \n\t"
        "pfadd     %%mm3, %%mm1 \n\t"
        "pfsub     %%mm2, %%mm4 \n\t"
        "pfsub     %%mm3, %%mm5 \n\t"
        "movq      %%mm0, %%mm2 \n\t"
        /* Swap the re/im halves of mm5 without an extra temp:
         * punpckldq puts mm5.lo into mm6.hi (mm6.lo is don't-care),
         * punpckhdq then builds mm5 = { mm5.hi, mm5.lo } — the stale half
         * of mm6 never reaches the result. */
        "punpckldq %%mm5, %%mm6 \n\t"
        "punpckhdq %%mm6, %%mm5 \n\t"
        "movq      %%mm4, %%mm3 \n\t"
        /* swap + sign flip via mm7 == multiply (z2-z3) by +/-i */
        "pxor      %%mm7, %%mm5 \n\t"
        /* pass-1 butterflies combine the two pass-0 results */
        "pfadd     %%mm1, %%mm0 \n\t"
        "pfadd     %%mm5, %%mm4 \n\t"
        "pfsub     %%mm1, %%mm2 \n\t"
        "pfsub     %%mm5, %%mm3 \n\t"
        "movq      %%mm0,   (%0,%1) \n\t"
        "movq      %%mm4,  8(%0,%1) \n\t"
        "movq      %%mm2, 16(%0,%1) \n\t"
        "movq      %%mm3, 24(%0,%1) \n\t"
        "jg 1b \n\t"                     /* loop while i > 0 */
        :"+r"(i)
        :"r"(z)
    );
    /* pass 2 .. ln-1 */

    nblocks = 1 << (ln-3);     /* number of butterfly blocks in the next pass */
    nloops  = 1 << 2;          /* butterflies per block, doubles each pass */
    cptr = s->exptab1;         /* interleaved twiddle factors for this pass */
    do {
        p = z;
        j = nblocks;
        do {
            /* One block: butterfly p[k] with p[k+nloops], multiplying the
             * second operand by the twiddle factor.  i is a byte offset,
             * counted down; %3 is indexed by i*2 because exptab1 stores two
             * quadwords (cre/cim pairs) per complex input. */
            i = nloops*8;
            asm volatile(
                "1: \n\t"
                "sub $16, %0 \n\t"            /* two complexes per iteration */
                "movq    (%1,%0), %%mm0 \n\t" /* a0 = p[k]   */
                "movq   8(%1,%0), %%mm1 \n\t" /* a1 = p[k+1] */
                "movq    (%2,%0), %%mm2 \n\t" /* b0 = q[k]   */
                "movq   8(%2,%0), %%mm3 \n\t" /* b1 = q[k+1] */
                "movq      %%mm2, %%mm4 \n\t"
                "movq      %%mm3, %%mm5 \n\t"
                "punpckldq %%mm2, %%mm2 \n\t" /* {re, re} */
                "punpckldq %%mm3, %%mm3 \n\t"
                "punpckhdq %%mm4, %%mm4 \n\t" /* {im, im} */
                "punpckhdq %%mm5, %%mm5 \n\t"
                "pfmul   (%3,%0,2), %%mm2 \n\t" // cre*re cim*re
                "pfmul  8(%3,%0,2), %%mm3 \n\t"
                "pfmul 16(%3,%0,2), %%mm4 \n\t" // -cim*im cre*im
                "pfmul 24(%3,%0,2), %%mm5 \n\t"
                "pfadd %%mm2, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im
                "pfadd %%mm3, %%mm5 \n\t"
                /* butterfly: p[k] = a + c*b,  q[k] = a - c*b */
                "movq  %%mm0, %%mm2 \n\t"
                "movq  %%mm1, %%mm3 \n\t"
                "pfadd %%mm4, %%mm0 \n\t"
                "pfadd %%mm5, %%mm1 \n\t"
                "pfsub %%mm4, %%mm2 \n\t"
                "pfsub %%mm5, %%mm3 \n\t"
                "movq  %%mm0,  (%1,%0) \n\t"
                "movq  %%mm1, 8(%1,%0) \n\t"
                "movq  %%mm2,  (%2,%0) \n\t"
                "movq  %%mm3, 8(%2,%0) \n\t"
                "jg 1b \n\t"
                :"+r"(i)
                :"r"(p), "r"(p + nloops), "r"(cptr)
            );
            p += nloops*2;     /* next block (skip the q half just processed) */
        } while (--j);
        cptr += nloops*2;      /* next pass uses the next twiddle sub-table */
        nblocks >>= 1;         /* half as many blocks ... */
        nloops <<= 1;          /* ... each twice as long */
    } while (nblocks != 0);
    /* leave MMX state clean for subsequent FPU code */
    asm volatile("femms");
}
135 | |
136 #endif |