comparison: i386/fft_sse.c @ 8031:eebc7209c47f (libavcodec)
Convert the asm keyword into __asm__.

Neither the asm() nor the __asm__() keyword is part of the C99 standard. GCC accepts the former in C89 mode, but rejects it in C99 mode unless GNU extensions are enabled (with -fasm). The latter form is accepted in either mode as an extension, without requiring further command-line options.

The Sun Studio C99 compiler likewise rejects asm() but accepts __asm__(), albeit with a warning that it is not valid C99 syntax.
| author   | flameeyes                       |
|----------|---------------------------------|
| date     | Thu, 16 Oct 2008 13:34:09 +0000 |
| parents  | 97383e012cb9                    |
| children |                                 |
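
To make the difference concrete, here is a small, hypothetical test file (not part of the changeset; the file and function names are illustrative) showing how the two spellings behave with GCC under different -std settings:

```c
/* asm_c99_example.c -- illustrative only, not taken from the changeset.
 *
 * gcc -std=c99   -c asm_c99_example.c  -> the plain asm() below is rejected,
 *                                         unless -fasm (or -std=gnu99) is added
 * gcc -std=gnu99 -c asm_c99_example.c  -> both forms are accepted
 *
 * __asm__() is accepted as an extension in every mode, which is why the
 * patch switches to that spelling.
 */

void barrier_portable(void)
{
    __asm__ volatile("" ::: "memory");  /* compiler barrier; accepted in C89 and C99 modes */
}

void barrier_gnu_only(void)
{
    asm volatile("" ::: "memory");      /* rejected by gcc -std=c99 without -fasm */
}
```

Switching every statement to __asm__ therefore lets the file build with -std=c99 without requiring further GNU-specific command-line options.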
```diff
--- i386/fft_sse.c  (changeset 8030:a512ac8fa540)
+++ i386/fft_sse.c  (changeset 8031:eebc7209c47f)
@@ -34,11 +34,11 @@
 
     ff_fft_dispatch_interleave_sse(z, s->nbits);
 
     if(n <= 16) {
         x86_reg i = -8*n;
-        asm volatile(
+        __asm__ volatile(
             "1: \n"
             "movaps (%0,%1), %%xmm0 \n"
             "movaps %%xmm0, %%xmm1 \n"
             "unpcklps 16(%0,%1), %%xmm0 \n"
             "unpckhps 16(%0,%1), %%xmm1 \n"
@@ -56,11 +56,11 @@
 void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
 {
     int n = 1 << s->nbits;
     int i;
     for(i=0; i<n; i+=2) {
-        asm volatile(
+        __asm__ volatile(
             "movaps %2, %%xmm0 \n"
             "movlps %%xmm0, %0 \n"
             "movhps %%xmm0, %1 \n"
             :"=m"(s->tmp_buf[s->revtab[i]]),
              "=m"(s->tmp_buf[s->revtab[i+1]])
@@ -82,11 +82,11 @@
     const FFTSample *tsin = s->tsin;
     FFTComplex *z = (FFTComplex *)output;
 
     /* pre rotation */
     for(k=n8-2; k>=0; k-=2) {
-        asm volatile(
+        __asm__ volatile(
             "movaps (%2,%1,2), %%xmm0 \n" // { z[k].re, z[k].im, z[k+1].re, z[k+1].im }
             "movaps -16(%2,%0,2), %%xmm1 \n" // { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im }
             "movaps %%xmm0, %%xmm2 \n"
             "shufps $0x88, %%xmm1, %%xmm0 \n" // { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re }
             "shufps $0x77, %%xmm2, %%xmm1 \n" // { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im }
@@ -109,24 +109,24 @@
              "r"(input+n4), "r"(tcos+n8), "r"(tsin+n8)
         );
 #ifdef ARCH_X86_64
         // if we have enough regs, don't let gcc make the luts latency-bound
         // but if not, latency is faster than spilling
-        asm("movlps %%xmm0, %0 \n"
+        __asm__("movlps %%xmm0, %0 \n"
             "movhps %%xmm0, %1 \n"
             "movlps %%xmm1, %2 \n"
             "movhps %%xmm1, %3 \n"
             :"=m"(z[revtab[-k-2]]),
              "=m"(z[revtab[-k-1]]),
              "=m"(z[revtab[ k ]]),
              "=m"(z[revtab[ k+1]])
         );
 #else
-        asm("movlps %%xmm0, %0" :"=m"(z[revtab[-k-2]]));
-        asm("movhps %%xmm0, %0" :"=m"(z[revtab[-k-1]]));
-        asm("movlps %%xmm1, %0" :"=m"(z[revtab[ k ]]));
-        asm("movhps %%xmm1, %0" :"=m"(z[revtab[ k+1]]));
+        __asm__("movlps %%xmm0, %0" :"=m"(z[revtab[-k-2]]));
+        __asm__("movhps %%xmm0, %0" :"=m"(z[revtab[-k-1]]));
+        __asm__("movlps %%xmm1, %0" :"=m"(z[revtab[ k ]]));
+        __asm__("movhps %%xmm1, %0" :"=m"(z[revtab[ k+1]]));
 #endif
     }
 
     ff_fft_dispatch_sse(z, s->fft.nbits);
 
@@ -144,11 +144,11 @@
     "subps %%xmm6, "#xmm0"\n"\
     "addps %%xmm7, "#xmm1"\n"
 
     j = -n2;
     k = n2-16;
-    asm volatile(
+    __asm__ volatile(
         "1: \n"
         CMUL(%0, %%xmm0, %%xmm1)
         CMUL(%1, %%xmm4, %%xmm5)
         "shufps $0x1b, %%xmm1, %%xmm1 \n"
         "shufps $0x1b, %%xmm5, %%xmm5 \n"
@@ -179,11 +179,11 @@
 
     ff_imdct_half_sse(s, output+n4, input);
 
     j = -n;
     k = n-16;
-    asm volatile(
+    __asm__ volatile(
         "movaps %4, %%xmm7 \n"
         "1: \n"
         "movaps (%2,%1), %%xmm0 \n"
         "movaps (%3,%0), %%xmm1 \n"
         "shufps $0x1b, %%xmm0, %%xmm0 \n"
```