annotate i386/fft_sse.c @ 7778:e31b0b920475 libavcodec

theoradec: skip decoding of uncoded MV in 4MV code. Thusnelda, the new experimental Theora encoder, is using this Theora feature that was previously not exploited. Fixes issue579.
author aurel
date Wed, 03 Sep 2008 00:17:11 +0000
parents 97383e012cb9
children eebc7209c47f
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
1 /*
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
2 * FFT/MDCT transform with SSE optimizations
7544
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
3 * Copyright (c) 2008 Loren Merritt
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
4 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3746
diff changeset
5 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3746
diff changeset
6 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3746
diff changeset
7 * FFmpeg is free software; you can redistribute it and/or
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
8 * modify it under the terms of the GNU Lesser General Public
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
9 * License as published by the Free Software Foundation; either
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3746
diff changeset
10 * version 2.1 of the License, or (at your option) any later version.
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
11 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3746
diff changeset
12 * FFmpeg is distributed in the hope that it will be useful,
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
15 * Lesser General Public License for more details.
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
16 *
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
17 * You should have received a copy of the GNU Lesser General Public
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3746
diff changeset
18 * License along with FFmpeg; if not, write to the Free Software
3036
0b546eab515d Update licensing information: The FSF changed postal address.
diego
parents: 2979
diff changeset
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
20 */
6763
f7cbb7733146 Use full path for #includes from another directory.
diego
parents: 6755
diff changeset
21
f7cbb7733146 Use full path for #includes from another directory.
diego
parents: 6755
diff changeset
22 #include "libavutil/x86_cpu.h"
f7cbb7733146 Use full path for #includes from another directory.
diego
parents: 6755
diff changeset
23 #include "libavcodec/dsputil.h"
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
24
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
/* Four copies of the IEEE-754 single-precision sign bit (0x80000000).
 * ff_imdct_calc_sse() loads this table with movaps and XORs it into a vector
 * of four floats to negate them, hence the 16-byte alignment.
 * The elements are unsigned and built with 1U << 31: shifting a signed int
 * into its sign bit is undefined behavior in C. */
static const uint32_t m1m1m1m1[4] __attribute__((aligned(16))) =
    { 1U << 31, 1U << 31, 1U << 31, 1U << 31 };
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
27
7542
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
/* FFT kernels whose definitions are not in this file (presumably hand-written
 * assembly - confirm against the build). Both take the complex buffer and the
 * log2 of the transform size. ff_imdct_half_sse() calls the plain variant;
 * ff_fft_calc_sse() calls the "interleave" variant. */
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
30
1879
dd63cb7e5080 fft_*() renamed into ff_fft_*() patch by (Gildas Bazin <gbazin at altern dot org>)
michael
parents: 968
diff changeset
31 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
32 {
7542
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
33 int n = 1 << s->nbits;
968
64f1a11b5f86 added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents: 781
diff changeset
34
7542
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
35 ff_fft_dispatch_interleave_sse(z, s->nbits);
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
36
7542
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
37 if(n <= 16) {
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
38 x86_reg i = -8*n;
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
39 asm volatile(
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
40 "1: \n"
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
41 "movaps (%0,%1), %%xmm0 \n"
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
42 "movaps %%xmm0, %%xmm1 \n"
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
43 "unpcklps 16(%0,%1), %%xmm0 \n"
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
44 "unpckhps 16(%0,%1), %%xmm1 \n"
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
45 "movaps %%xmm0, (%0,%1) \n"
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
46 "movaps %%xmm1, 16(%0,%1) \n"
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
47 "add $32, %0 \n"
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
48 "jl 1b \n"
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
49 :"+r"(i)
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
50 :"r"(z+n)
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
51 :"memory"
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
52 );
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
53 }
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
54 }
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
55
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
56 void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
57 {
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
58 int n = 1 << s->nbits;
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
59 int i;
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
60 for(i=0; i<n; i+=2) {
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
61 asm volatile(
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
62 "movaps %2, %%xmm0 \n"
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
63 "movlps %%xmm0, %0 \n"
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
64 "movhps %%xmm0, %1 \n"
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
65 :"=m"(s->tmp_buf[s->revtab[i]]),
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
66 "=m"(s->tmp_buf[s->revtab[i+1]])
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
67 :"m"(z[i])
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
68 );
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
69 }
a8a8205a9081 split-radix FFT
lorenm
parents: 7263
diff changeset
70 memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
71 }
968
64f1a11b5f86 added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents: 781
diff changeset
72
7544
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
/**
 * SSE implementation of the "half" inverse MDCT (presumably produces half of
 * the full IMDCT output; ff_imdct_calc_sse() derives the rest - confirm).
 *
 * Three stages, all operating on output reinterpreted as FFTComplex:
 *  1. pre-rotation: input samples are multiplied by the tcos/tsin tables and
 *     scattered into z through the revtab permutation;
 *  2. ff_fft_dispatch_sse() transforms z in place;
 *  3. post-rotation: z is multiplied by the tables again and reordered,
 *     working inwards from both ends of the buffer.
 *
 * @param s      MDCT context; reads nbits, fft.revtab, fft.nbits, tcos, tsin
 * @param output destination, also used as the FFT work buffer; accessed with
 *               movaps, so it must be 16-byte aligned
 * @param input  source coefficients; input+n4 is read with movaps
 */
void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input)
{
    /* i and l are never referenced in this function; only j and k are used. */
    av_unused x86_reg i, j, k, l;
    long n = 1 << s->nbits;
    long n2 = n >> 1;
    long n4 = n >> 2;
    long n8 = n >> 3;
    /* revtab, tcos and tsin are addressed relative to their n8-th element so
     * that both +k and -k-1 style indices can be used below. */
    const uint16_t *revtab = s->fft.revtab + n8;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    FFTComplex *z = (FFTComplex *)output;

    /* pre rotation */
    /* Each iteration handles two values on either side of the centre
     * (indices k, k+1 and -k-2, -k-1). */
    for(k=n8-2; k>=0; k-=2) {
        /* This statement has no output operands: its results are left in
         * xmm0/xmm1 and picked up by the store statement(s) that follow.
         * NOTE(review): this relies on the compiler not using those registers
         * between the asm statements, and no xmm clobbers are declared. */
        asm volatile(
            "movaps (%2,%1,2), %%xmm0 \n" // { z[k].re, z[k].im, z[k+1].re, z[k+1].im }
            "movaps -16(%2,%0,2), %%xmm1 \n" // { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im }
            "movaps %%xmm0, %%xmm2 \n"
            "shufps $0x88, %%xmm1, %%xmm0 \n" // { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re }
            "shufps $0x77, %%xmm2, %%xmm1 \n" // { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im }
            "movlps (%3,%1), %%xmm4 \n"
            "movlps (%4,%1), %%xmm5 \n"
            "movhps -8(%3,%0), %%xmm4 \n" // { cos[k], cos[k+1], cos[-k-2], cos[-k-1] }
            "movhps -8(%4,%0), %%xmm5 \n" // { sin[k], sin[k+1], sin[-k-2], sin[-k-1] }
            "movaps %%xmm0, %%xmm2 \n"
            "movaps %%xmm1, %%xmm3 \n"
            "mulps %%xmm5, %%xmm0 \n" // re*sin
            "mulps %%xmm4, %%xmm1 \n" // im*cos
            "mulps %%xmm4, %%xmm2 \n" // re*cos
            "mulps %%xmm5, %%xmm3 \n" // im*sin
            "subps %%xmm0, %%xmm1 \n" // -> re
            "addps %%xmm3, %%xmm2 \n" // -> im
            "movaps %%xmm1, %%xmm0 \n"
            "unpcklps %%xmm2, %%xmm1 \n" // { z[k], z[k+1] }
            "unpckhps %%xmm2, %%xmm0 \n" // { z[-k-2], z[-k-1] }
            ::"r"(-4*k), "r"(4*k),
              "r"(input+n4), "r"(tcos+n8), "r"(tsin+n8)
        );
        /* Scatter the four rotated values into z at the positions given by
         * revtab: xmm0 holds the -k-2/-k-1 pair, xmm1 the k/k+1 pair. */
#ifdef ARCH_X86_64
        // if we have enough regs, don't let gcc make the luts latency-bound
        // but if not, latency is faster than spilling
        asm("movlps %%xmm0, %0 \n"
            "movhps %%xmm0, %1 \n"
            "movlps %%xmm1, %2 \n"
            "movhps %%xmm1, %3 \n"
            :"=m"(z[revtab[-k-2]]),
             "=m"(z[revtab[-k-1]]),
             "=m"(z[revtab[ k ]]),
             "=m"(z[revtab[ k+1]])
        );
#else
        asm("movlps %%xmm0, %0" :"=m"(z[revtab[-k-2]]));
        asm("movhps %%xmm0, %0" :"=m"(z[revtab[-k-1]]));
        asm("movlps %%xmm1, %0" :"=m"(z[revtab[ k ]]));
        asm("movhps %%xmm1, %0" :"=m"(z[revtab[ k+1]]));
#endif
    }

    ff_fft_dispatch_sse(z, s->fft.nbits);

    /* post rotation + reinterleave + reorder */

    /* CMUL(j, xmm0, xmm1): with A = the 16 bytes at (%2 + 2*j), B = the next
     * 16 bytes, c = the 16 bytes at (%3 + j) and s = the 16 bytes at (%4 + j),
     * leaves  xmm0 = B*s - A*c  and  xmm1 = A*s + B*c  (element-wise).
     * xmm6 and xmm7 are used as scratch. In the asm statement below %2 is
     * z+n8, %3 is tcos+n8 and %4 is tsin+n8. The macro is defined inside the
     * function body and is not #undef'd afterwards. */
#define CMUL(j,xmm0,xmm1)\
        "movaps (%2,"#j",2), %%xmm6 \n"\
        "movaps 16(%2,"#j",2), "#xmm0"\n"\
        "movaps %%xmm6, "#xmm1"\n"\
        "movaps "#xmm0",%%xmm7 \n"\
        "mulps (%3,"#j"), %%xmm6 \n"\
        "mulps (%4,"#j"), "#xmm0"\n"\
        "mulps (%4,"#j"), "#xmm1"\n"\
        "mulps (%3,"#j"), %%xmm7 \n"\
        "subps %%xmm6, "#xmm0"\n"\
        "addps %%xmm7, "#xmm1"\n"

    /* j walks upwards from -n2 and k downwards from n2-16, 16 per iteration,
     * so the two CMUL blocks approach each other from opposite ends. The loop
     * ends when j stops being negative (the jl tests the flags of the final
     * add on j). */
    j = -n2;
    k = n2-16;
    asm volatile(
        "1: \n"
        CMUL(%0, %%xmm0, %%xmm1)
        CMUL(%1, %%xmm4, %%xmm5)
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "shufps $0x1b, %%xmm5, %%xmm5 \n"
        "movaps %%xmm4, %%xmm6 \n"
        "unpckhps %%xmm1, %%xmm4 \n"
        "unpcklps %%xmm1, %%xmm6 \n"
        "movaps %%xmm0, %%xmm2 \n"
        "unpcklps %%xmm5, %%xmm0 \n"
        "unpckhps %%xmm5, %%xmm2 \n"
        "movaps %%xmm6, (%2,%1,2) \n"
        "movaps %%xmm4, 16(%2,%1,2) \n"
        "movaps %%xmm0, (%2,%0,2) \n"
        "movaps %%xmm2, 16(%2,%0,2) \n"
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"
        :"+&r"(j), "+&r"(k)
        :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8)
        :"memory"
    );
}
fc843d00867c exploit mdct symmetry
lorenm
parents: 6763
diff changeset
173
7546
97383e012cb9 remove mdct tmp buffer
lorenm
parents: 7544
diff changeset
174 void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input)
7263
fc843d00867c exploit mdct symmetry
lorenm
parents: 6763
diff changeset
175 {
7544
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
176 x86_reg j, k;
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
177 long n = 1 << s->nbits;
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
178 long n4 = n >> 2;
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
179
7544
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
180 ff_imdct_half_sse(s, output+n4, input);
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
181
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
182 j = -n;
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
183 k = n-16;
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
184 asm volatile(
7544
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
185 "movaps %4, %%xmm7 \n"
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
186 "1: \n"
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
187 "movaps (%2,%1), %%xmm0 \n"
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
188 "movaps (%3,%0), %%xmm1 \n"
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
189 "shufps $0x1b, %%xmm0, %%xmm0 \n"
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
190 "shufps $0x1b, %%xmm1, %%xmm1 \n"
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
191 "xorps %%xmm7, %%xmm0 \n"
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
192 "movaps %%xmm1, (%3,%1) \n"
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
193 "movaps %%xmm0, (%2,%0) \n"
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
194 "sub $16, %1 \n"
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
195 "add $16, %0 \n"
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
196 "jl 1b \n"
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
197 :"+r"(j), "+r"(k)
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
198 :"r"(output+n4), "r"(output+n4*3),
ee1cb5ab9f99 optimize imdct_half:
lorenm
parents: 7542
diff changeset
199 "m"(*m1m1m1m1)
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
200 );
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
201 }
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
202