Mercurial > libavcodec.hg
annotate i386/fft_3dn2.c @ 6920:d02af7474bff libavcodec
Prevent 128*1<<trellis from becoming 0 and creating 0 sized arrays.
fixes CID84 RUN2
CID85 RUN2
CID86 RUN2
CID87 RUN2
CID88 RUN2
CID89 RUN2
CID90 RUN2
CID91 RUN2
CID92 RUN2
CID93 RUN2
CID94 RUN2
CID95 RUN2
CID96 RUN2
CID97 RUN2
CID98 RUN2
CID99 RUN2
CID100 RUN2
CID101 RUN2
CID102 RUN2
CID103 RUN2
CID104 RUN2
CID105 RUN2
CID106 RUN2
author | michael |
---|---|
date | Wed, 28 May 2008 11:59:41 +0000 |
parents | f7cbb7733146 |
children | fc843d00867c |
rev | line source |
---|---|
3175 | 1 /* |
2 * FFT/MDCT transform with Extended 3DNow! optimizations | |
3555 | 3 * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt |
3175 | 4 * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard. |
5 * | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3748
diff
changeset
|
6 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3748
diff
changeset
|
7 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3748
diff
changeset
|
8 * FFmpeg is free software; you can redistribute it and/or |
3175 | 9 * modify it under the terms of the GNU Lesser General Public |
10 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3748
diff
changeset
|
11 * version 2.1 of the License, or (at your option) any later version. |
3175 | 12 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3748
diff
changeset
|
13 * FFmpeg is distributed in the hope that it will be useful, |
3175 | 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 * Lesser General Public License for more details. | |
17 * | |
18 * You should have received a copy of the GNU Lesser General Public | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3748
diff
changeset
|
19 * License along with FFmpeg; if not, write to the Free Software |
3175 | 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
21 */ | |
6763 | 22 |
23 #include "libavutil/x86_cpu.h" | |
24 #include "libavcodec/dsputil.h" | |
3175 | 25 |
/*
 * 64-bit XOR masks used by ff_fft_calc_3dn2() to flip the sign bit of one
 * of the two packed single-precision floats held in an MMX register.
 * Only bit 31 of one 32-bit lane is set:
 *   p1m1: low lane untouched, high lane sign-flipped
 *   m1p1: low lane sign-flipped, high lane untouched
 * ff_fft_calc_3dn2() picks m1p1 when s->inverse is set, p1m1 otherwise.
 *
 * The masks are stored as unsigned values built from 1U << 31: shifting a
 * signed 1 into the sign bit (1 << 31) is undefined behaviour in C.
 * 8-byte alignment is kept because the table is loaded with a single movq.
 */
static const uint32_t p1m1[2] __attribute__((aligned(8))) =
    { 0, 1U << 31 };

static const uint32_t m1p1[2] __attribute__((aligned(8))) =
    { 1U << 31, 0 };
31 | |
32 void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z) | |
33 { | |
34 int ln = s->nbits; | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
5010
diff
changeset
|
35 long j; |
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
5010
diff
changeset
|
36 x86_reg i; |
3590
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
37 long nblocks, nloops; |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
38 FFTComplex *p, *cptr; |
3175 | 39 |
3590
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
40 asm volatile( |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
41 /* FEMMS is not a must here but recommended by AMD */ |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
42 "femms \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
43 "movq %0, %%mm7 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
44 ::"m"(*(s->inverse ? m1p1 : p1m1)) |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
45 ); |
3175 | 46 |
3590
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
47 i = 8 << ln; |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
48 asm volatile( |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
49 "1: \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
50 "sub $32, %0 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
51 "movq (%0,%1), %%mm0 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
52 "movq 16(%0,%1), %%mm1 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
53 "movq 8(%0,%1), %%mm2 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
54 "movq 24(%0,%1), %%mm3 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
55 "movq %%mm0, %%mm4 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
56 "movq %%mm1, %%mm5 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
57 "pfadd %%mm2, %%mm0 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
58 "pfadd %%mm3, %%mm1 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
59 "pfsub %%mm2, %%mm4 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
60 "pfsub %%mm3, %%mm5 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
61 "movq %%mm0, %%mm2 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
62 "pswapd %%mm5, %%mm5 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
63 "movq %%mm4, %%mm3 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
64 "pxor %%mm7, %%mm5 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
65 "pfadd %%mm1, %%mm0 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
66 "pfadd %%mm5, %%mm4 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
67 "pfsub %%mm1, %%mm2 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
68 "pfsub %%mm5, %%mm3 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
69 "movq %%mm0, (%0,%1) \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
70 "movq %%mm4, 8(%0,%1) \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
71 "movq %%mm2, 16(%0,%1) \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
72 "movq %%mm3, 24(%0,%1) \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
73 "jg 1b \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
74 :"+r"(i) |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
75 :"r"(z) |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
76 ); |
3175 | 77 /* pass 2 .. ln-1 */ |
78 | |
3590
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
79 nblocks = 1 << (ln-3); |
3175 | 80 nloops = 1 << 2; |
3590
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
81 cptr = s->exptab1; |
3175 | 82 do { |
83 p = z; | |
84 j = nblocks; | |
85 do { | |
3590
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
86 i = nloops*8; |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
87 asm volatile( |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
88 "1: \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
89 "sub $16, %0 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
90 "movq (%1,%0), %%mm0 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
91 "movq 8(%1,%0), %%mm1 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
92 "movq (%2,%0), %%mm2 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
93 "movq 8(%2,%0), %%mm3 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
94 "movq (%3,%0,2), %%mm4 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
95 "movq 8(%3,%0,2), %%mm5 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
96 "pswapd %%mm4, %%mm6 \n\t" // no need for cptr[2] & cptr[3] |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
97 "pswapd %%mm5, %%mm7 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
98 "pfmul %%mm2, %%mm4 \n\t" // cre*re cim*im |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
99 "pfmul %%mm3, %%mm5 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
100 "pfmul %%mm2, %%mm6 \n\t" // cim*re cre*im |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
101 "pfmul %%mm3, %%mm7 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
102 "pfpnacc %%mm6, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
103 "pfpnacc %%mm7, %%mm5 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
104 "movq %%mm0, %%mm2 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
105 "movq %%mm1, %%mm3 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
106 "pfadd %%mm4, %%mm0 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
107 "pfadd %%mm5, %%mm1 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
108 "pfsub %%mm4, %%mm2 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
109 "pfsub %%mm5, %%mm3 \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
110 "movq %%mm0, (%1,%0) \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
111 "movq %%mm1, 8(%1,%0) \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
112 "movq %%mm2, (%2,%0) \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
113 "movq %%mm3, 8(%2,%0) \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
114 "jg 1b \n\t" |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
115 :"+r"(i) |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
116 :"r"(p), "r"(p + nloops), "r"(cptr) |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
117 ); |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
118 p += nloops*2; |
3175 | 119 } while (--j); |
3590
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
120 cptr += nloops*2; |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
121 nblocks >>= 1; |
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
122 nloops <<= 1; |
3175 | 123 } while (nblocks != 0); |
3590
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
124 asm volatile("femms"); |
3175 | 125 } |
126 | |
3555 | 127 void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, |
128 const FFTSample *input, FFTSample *tmp) | |
129 { | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
5010
diff
changeset
|
130 long n8, n4, n2, n; |
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
5010
diff
changeset
|
131 x86_reg k; |
3555 | 132 const uint16_t *revtab = s->fft.revtab; |
133 const FFTSample *tcos = s->tcos; | |
134 const FFTSample *tsin = s->tsin; | |
135 const FFTSample *in1, *in2; | |
136 FFTComplex *z = (FFTComplex *)tmp; | |
137 | |
138 n = 1 << s->nbits; | |
139 n2 = n >> 1; | |
140 n4 = n >> 2; | |
141 n8 = n >> 3; | |
142 | |
143 /* pre rotation */ | |
144 in1 = input; | |
145 in2 = input + n2 - 1; | |
146 for(k = 0; k < n4; k++) { | |
3560
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
147 // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it |
3555 | 148 asm volatile( |
3560
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
149 "movd %0, %%mm0 \n\t" |
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
150 "movd %2, %%mm1 \n\t" |
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
151 "punpckldq %1, %%mm0 \n\t" |
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
152 "punpckldq %3, %%mm1 \n\t" |
3555 | 153 "movq %%mm0, %%mm2 \n\t" |
154 "pfmul %%mm1, %%mm0 \n\t" | |
155 "pswapd %%mm1, %%mm1 \n\t" | |
156 "pfmul %%mm1, %%mm2 \n\t" | |
157 "pfpnacc %%mm2, %%mm0 \n\t" | |
3560
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
158 ::"m"(in2[-2*k]), "m"(in1[2*k]), |
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
159 "m"(tcos[k]), "m"(tsin[k]) |
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
160 ); |
f1a16d793fc5
gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents:
3559
diff
changeset
|
161 asm volatile( |
3555 | 162 "movq %%mm0, %0 \n\t" |
163 :"=m"(z[revtab[k]]) | |
164 ); | |
165 } | |
166 | |
167 ff_fft_calc(&s->fft, z); | |
168 | |
169 /* post rotation + reordering */ | |
170 for(k = 0; k < n4; k++) { | |
171 asm volatile( | |
172 "movq %0, %%mm0 \n\t" | |
173 "movd %1, %%mm1 \n\t" | |
174 "punpckldq %2, %%mm1 \n\t" | |
175 "movq %%mm0, %%mm2 \n\t" | |
176 "pfmul %%mm1, %%mm0 \n\t" | |
177 "pswapd %%mm1, %%mm1 \n\t" | |
178 "pfmul %%mm1, %%mm2 \n\t" | |
179 "pfpnacc %%mm2, %%mm0 \n\t" | |
180 "movq %%mm0, %0 \n\t" | |
181 :"+m"(z[k]) | |
182 :"m"(tcos[k]), "m"(tsin[k]) | |
183 ); | |
184 } | |
185 | |
3747 | 186 k = n-8; |
3555 | 187 asm volatile("movd %0, %%mm7" ::"r"(1<<31)); |
3747 | 188 asm volatile( |
3748 | 189 "1: \n\t" |
190 "movq (%4,%0), %%mm0 \n\t" // z[n8+k] | |
191 "neg %0 \n\t" | |
192 "pswapd -8(%4,%0), %%mm1 \n\t" // z[n8-1-k] | |
193 "movq %%mm0, %%mm2 \n\t" | |
194 "pxor %%mm7, %%mm2 \n\t" | |
195 "punpckldq %%mm1, %%mm2 \n\t" | |
196 "pswapd %%mm2, %%mm3 \n\t" | |
197 "punpckhdq %%mm1, %%mm0 \n\t" | |
198 "pswapd %%mm0, %%mm4 \n\t" | |
199 "pxor %%mm7, %%mm0 \n\t" | |
200 "pxor %%mm7, %%mm4 \n\t" | |
201 "movq %%mm3, -8(%3,%0) \n\t" // output[n-2-2*k] = { z[n8-1-k].im, -z[n8+k].re } | |
202 "movq %%mm4, -8(%2,%0) \n\t" // output[n2-2-2*k]= { -z[n8-1-k].re, z[n8+k].im } | |
203 "neg %0 \n\t" | |
204 "movq %%mm0, (%1,%0) \n\t" // output[2*k] = { -z[n8+k].im, z[n8-1-k].re } | |
205 "movq %%mm2, (%2,%0) \n\t" // output[n2+2*k] = { -z[n8+k].re, z[n8-1-k].im } | |
206 "sub $8, %0 \n\t" | |
207 "jge 1b \n\t" | |
3747 | 208 :"+r"(k) |
209 :"r"(output), "r"(output+n2), "r"(output+n), "r"(z+n8) | |
210 :"memory" | |
211 ); | |
3561 | 212 asm volatile("femms"); |
3555 | 213 } |
3590
a3d97c60ea07
ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents:
3561
diff
changeset
|
214 |