Mercurial > libavcodec.hg
annotate x86/fft_3dn2.c @ 12454:f4355cd85faa libavcodec
Port latest x264 deblock asm (before they moved to using NV12 as internal
format), LGPL'ed with permission from Jason and Loren. This includes mmx2
code, so remove inline asm from h264dsp_mmx.c accordingly.
author | rbultje |
---|---|
date | Fri, 03 Sep 2010 16:52:46 +0000 |
parents | b64b8e5a2d3a |
children |
rev | line source |
---|---|
8430 | 1 /* |
2 * FFT/MDCT transform with Extended 3DNow! optimizations | |
3 * Copyright (c) 2006-2008 Zuxy MENG Jie, Loren Merritt | |
4 * | |
5 * This file is part of FFmpeg. | |
6 * | |
7 * FFmpeg is free software; you can redistribute it and/or | |
8 * modify it under the terms of the GNU Lesser General Public | |
9 * License as published by the Free Software Foundation; either | |
10 * version 2.1 of the License, or (at your option) any later version. | |
11 * | |
12 * FFmpeg is distributed in the hope that it will be useful, | |
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
18 * License along with FFmpeg; if not, write to the Free Software | |
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 */ | |
21 | |
22 #include "libavutil/x86_cpu.h" | |
23 #include "libavcodec/dsputil.h" | |
10175
5cf49858179a
Move per-arch fft init bits into the corresponding subdirs
mru
parents:
8430
diff
changeset
|
24 #include "fft.h" |
8430 | 25 |
11369 | 26 DECLARE_ALIGNED(8, static const int, m1m1)[2] = { 1<<31, 1<<31 }; |
8430 | 27 |
/*
 * PSWAPD(s,d) expands to asm text that leaves the two 32-bit halves of s,
 * exchanged, in MMX register d.
 *
 * Without EMULATE_3DNOWEXT the native "pswapd" instruction is emitted.
 * With EMULATE_3DNOWEXT the swap is built from three instructions
 * (copy, shift the high half down, re-insert the low half of s on top), so s
 * is read twice and d must not be the same register as s.  In that mode the
 * public *_3dn2 names are also redefined to their *_3dn counterparts, so the
 * functions defined below come out under the *_3dn names.
 */
#ifndef EMULATE_3DNOWEXT
#define PSWAPD(s,d) "pswapd "#s","#d"\n"
#else
#define PSWAPD(s,d)\
    "movq "#s","#d"\n"\
    "psrlq $32,"#d"\n"\
    "punpckldq "#s","#d"\n"
#define ff_fft_calc_3dn2 ff_fft_calc_3dn
#define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn
#define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn
#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn
#define ff_imdct_half_3dn2 ff_imdct_half_3dn
#endif
41 | |
42 void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits); | |
43 void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits); | |
44 | |
45 void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z) | |
46 { | |
47 int n = 1<<s->nbits; | |
48 int i; | |
49 ff_fft_dispatch_interleave_3dn2(z, s->nbits); | |
50 __asm__ volatile("femms"); | |
51 if(n <= 8) | |
52 for(i=0; i<n; i+=2) | |
53 FFSWAP(FFTSample, z[i].im, z[i+1].re); | |
54 } | |
55 | |
10199 | 56 void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input) |
8430 | 57 { |
58 x86_reg j, k; | |
12405
b64b8e5a2d3a
imdct/x86: Use "s->mdct_size" instead of "1 << s->mdct_bits".
alexc
parents:
11369
diff
changeset
|
59 long n = s->mdct_size; |
8430 | 60 long n2 = n >> 1; |
61 long n4 = n >> 2; | |
62 long n8 = n >> 3; | |
10199 | 63 const uint16_t *revtab = s->revtab; |
8430 | 64 const FFTSample *tcos = s->tcos; |
65 const FFTSample *tsin = s->tsin; | |
66 const FFTSample *in1, *in2; | |
67 FFTComplex *z = (FFTComplex *)output; | |
68 | |
69 /* pre rotation */ | |
70 in1 = input; | |
71 in2 = input + n2 - 1; | |
72 #ifdef EMULATE_3DNOWEXT | |
73 __asm__ volatile("movd %0, %%mm7" ::"r"(1<<31)); | |
74 #endif | |
75 for(k = 0; k < n4; k++) { | |
76 // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it | |
77 __asm__ volatile( | |
78 "movd %0, %%mm0 \n" | |
79 "movd %2, %%mm1 \n" | |
80 "punpckldq %1, %%mm0 \n" | |
81 "punpckldq %3, %%mm1 \n" | |
82 "movq %%mm0, %%mm2 \n" | |
83 PSWAPD( %%mm1, %%mm3 ) | |
84 "pfmul %%mm1, %%mm0 \n" | |
85 "pfmul %%mm3, %%mm2 \n" | |
86 #ifdef EMULATE_3DNOWEXT | |
87 "movq %%mm0, %%mm1 \n" | |
88 "punpckhdq %%mm2, %%mm0 \n" | |
89 "punpckldq %%mm2, %%mm1 \n" | |
90 "pxor %%mm7, %%mm0 \n" | |
91 "pfadd %%mm1, %%mm0 \n" | |
92 #else | |
93 "pfpnacc %%mm2, %%mm0 \n" | |
94 #endif | |
95 ::"m"(in2[-2*k]), "m"(in1[2*k]), | |
96 "m"(tcos[k]), "m"(tsin[k]) | |
97 ); | |
98 __asm__ volatile( | |
99 "movq %%mm0, %0 \n\t" | |
100 :"=m"(z[revtab[k]]) | |
101 ); | |
102 } | |
103 | |
10199 | 104 ff_fft_dispatch_3dn2(z, s->nbits); |
8430 | 105 |
106 #define CMUL(j,mm0,mm1)\ | |
107 "movq (%2,"#j",2), %%mm6 \n"\ | |
108 "movq 8(%2,"#j",2), "#mm0"\n"\ | |
109 "movq %%mm6, "#mm1"\n"\ | |
110 "movq "#mm0",%%mm7 \n"\ | |
111 "pfmul (%3,"#j"), %%mm6 \n"\ | |
112 "pfmul (%4,"#j"), "#mm0"\n"\ | |
113 "pfmul (%4,"#j"), "#mm1"\n"\ | |
114 "pfmul (%3,"#j"), %%mm7 \n"\ | |
115 "pfsub %%mm6, "#mm0"\n"\ | |
116 "pfadd %%mm7, "#mm1"\n" | |
117 | |
118 /* post rotation */ | |
119 j = -n2; | |
120 k = n2-8; | |
121 __asm__ volatile( | |
122 "1: \n" | |
123 CMUL(%0, %%mm0, %%mm1) | |
124 CMUL(%1, %%mm2, %%mm3) | |
125 "movd %%mm0, (%2,%0,2) \n" | |
126 "movd %%mm1,12(%2,%1,2) \n" | |
127 "movd %%mm2, (%2,%1,2) \n" | |
128 "movd %%mm3,12(%2,%0,2) \n" | |
129 "psrlq $32, %%mm0 \n" | |
130 "psrlq $32, %%mm1 \n" | |
131 "psrlq $32, %%mm2 \n" | |
132 "psrlq $32, %%mm3 \n" | |
133 "movd %%mm0, 8(%2,%0,2) \n" | |
134 "movd %%mm1, 4(%2,%1,2) \n" | |
135 "movd %%mm2, 8(%2,%1,2) \n" | |
136 "movd %%mm3, 4(%2,%0,2) \n" | |
137 "sub $8, %1 \n" | |
138 "add $8, %0 \n" | |
139 "jl 1b \n" | |
140 :"+r"(j), "+r"(k) | |
141 :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8) | |
142 :"memory" | |
143 ); | |
144 __asm__ volatile("femms"); | |
145 } | |
146 | |
10199 | 147 void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input) |
8430 | 148 { |
149 x86_reg j, k; | |
12405
b64b8e5a2d3a
imdct/x86: Use "s->mdct_size" instead of "1 << s->mdct_bits".
alexc
parents:
11369
diff
changeset
|
150 long n = s->mdct_size; |
8430 | 151 long n4 = n >> 2; |
152 | |
153 ff_imdct_half_3dn2(s, output+n4, input); | |
154 | |
155 j = -n; | |
156 k = n-8; | |
157 __asm__ volatile( | |
158 "movq %4, %%mm7 \n" | |
159 "1: \n" | |
160 PSWAPD((%2,%1), %%mm0) | |
161 PSWAPD((%3,%0), %%mm1) | |
162 "pxor %%mm7, %%mm0 \n" | |
163 "movq %%mm1, (%3,%1) \n" | |
164 "movq %%mm0, (%2,%0) \n" | |
165 "sub $8, %1 \n" | |
166 "add $8, %0 \n" | |
167 "jl 1b \n" | |
168 :"+r"(j), "+r"(k) | |
169 :"r"(output+n4), "r"(output+n4*3), | |
170 "m"(*m1m1) | |
171 ); | |
172 __asm__ volatile("femms"); | |
173 } | |
174 |