annotate i386/fft_sse.c @ 4889:beeb03aad909 libavcodec

patch so that the deprecated items show up correctly when building doxygen docs patch by mark cox melbournemark plus ffmpeg minus devel chez gmail dot com
author benoit
date Wed, 02 May 2007 09:13:47 +0000
parents c8c591fe26f8
children 743a8b12b7de
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
1 /*
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
2 * FFT/MDCT transform with SSE optimizations
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
3 * Copyright (c) 2002 Fabrice Bellard.
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
4 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3746
diff changeset
5 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3746
diff changeset
6 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3746
diff changeset
7 * FFmpeg is free software; you can redistribute it and/or
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
8 * modify it under the terms of the GNU Lesser General Public
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
9 * License as published by the Free Software Foundation; either
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3746
diff changeset
10 * version 2.1 of the License, or (at your option) any later version.
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
11 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3746
diff changeset
12 * FFmpeg is distributed in the hope that it will be useful,
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
15 * Lesser General Public License for more details.
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
16 *
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
17 * You should have received a copy of the GNU Lesser General Public
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3746
diff changeset
18 * License along with FFmpeg; if not, write to the Free Software
3036
0b546eab515d Update licensing information: The FSF changed postal address.
diego
parents: 2979
diff changeset
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
20 */
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
21 #include "../dsputil.h"
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
22
3166
ab1273ffe275 use xorps instead of mulps to toggle the sign of a float, as suggested by Software Optimization Guide for AMD64 Processors.
gpoirier
parents: 3036
diff changeset
/*
 * Sign-bit masks, one 32-bit word per SSE lane (lane 0 first).
 * XORing a vector of floats with one of these flips the sign of the lanes
 * whose word has bit 31 set and leaves the other lanes untouched.  In the
 * names, "p1" marks a lane that is kept and "m1" a lane that is negated.
 *
 * The words are unsigned and built with 1U << 31: shifting a 1 into the
 * sign bit of a signed int is undefined behaviour in C.
 * The tables are loaded with movaps, hence the 16-byte alignment.
 */
static const unsigned int p1p1p1m1[4] __attribute__((aligned(16))) =
    { 0, 0, 0, 1U << 31 };

static const unsigned int p1p1m1p1[4] __attribute__((aligned(16))) =
    { 0, 0, 1U << 31, 0 };

static const unsigned int p1p1m1m1[4] __attribute__((aligned(16))) =
    { 0, 0, 1U << 31, 1U << 31 };

static const unsigned int p1m1p1m1[4] __attribute__((aligned(16))) =
    { 0, 1U << 31, 0, 1U << 31 };

static const unsigned int m1m1m1m1[4] __attribute__((aligned(16))) =
    { 1U << 31, 1U << 31, 1U << 31, 1U << 31 };
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
37
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
#if 0
/* Debugging aid (compiled out): print a label followed by the four float
 * lanes of an SSE vector, lane 0 first. */
static void print_v4sf(const char *str, __m128 a)
{
    float *lane = (float *)&a;

    printf("%s: %f %f %f %f\n", str, lane[0], lane[1], lane[2], lane[3]);
}
#endif
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
46
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
47 /* XXX: handle reverse case */
1879
dd63cb7e5080 fft_*() renamed into ff_fft_*() patch by (Gildas Bazin <gbazin at altern dot org>)
michael
parents: 968
diff changeset
48 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
49 {
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
50 int ln = s->nbits;
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
51 long i, j;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
52 long nblocks, nloops;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
53 FFTComplex *p, *cptr;
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
54
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
55 asm volatile(
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
56 "movaps %0, %%xmm4 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
57 "movaps %1, %%xmm5 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
58 ::"m"(*p1p1m1m1),
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
59 "m"(*(s->inverse ? p1p1m1p1 : p1p1p1m1))
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
60 );
968
64f1a11b5f86 added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents: 781
diff changeset
61
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
62 i = 8 << ln;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
63 asm volatile(
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
64 "1: \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
65 "sub $32, %0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
66 /* do the pass 0 butterfly */
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
67 "movaps (%0,%1), %%xmm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
68 "movaps %%xmm0, %%xmm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
69 "shufps $0x4E, %%xmm0, %%xmm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
70 "xorps %%xmm4, %%xmm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
71 "addps %%xmm1, %%xmm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
72 "movaps 16(%0,%1), %%xmm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
73 "movaps %%xmm2, %%xmm3 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
74 "shufps $0x4E, %%xmm2, %%xmm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
75 "xorps %%xmm4, %%xmm3 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
76 "addps %%xmm3, %%xmm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
77 /* multiply third by -i */
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
78 /* by toggling the sign bit */
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
79 "shufps $0xB4, %%xmm2, %%xmm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
80 "xorps %%xmm5, %%xmm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
81 /* do the pass 1 butterfly */
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
82 "movaps %%xmm0, %%xmm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
83 "addps %%xmm2, %%xmm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
84 "subps %%xmm2, %%xmm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
85 "movaps %%xmm0, (%0,%1) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
86 "movaps %%xmm1, 16(%0,%1) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
87 "jg 1b \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
88 :"+r"(i)
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
89 :"r"(z)
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
90 );
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
91 /* pass 2 .. ln-1 */
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
92
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
93 nblocks = 1 << (ln-3);
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
94 nloops = 1 << 2;
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
95 cptr = s->exptab1;
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
96 do {
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
97 p = z;
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
98 j = nblocks;
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
99 do {
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
100 i = nloops*8;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
101 asm volatile(
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
102 "1: \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
103 "sub $16, %0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
104 "movaps (%2,%0), %%xmm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
105 "movaps (%1,%0), %%xmm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
106 "movaps %%xmm1, %%xmm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
107 "shufps $0xA0, %%xmm1, %%xmm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
108 "shufps $0xF5, %%xmm2, %%xmm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
109 "mulps (%3,%0,2), %%xmm1 \n\t" // cre*re cim*re
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
110 "mulps 16(%3,%0,2), %%xmm2 \n\t" // -cim*im cre*im
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
111 "addps %%xmm2, %%xmm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
112 "movaps %%xmm0, %%xmm3 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
113 "addps %%xmm1, %%xmm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
114 "subps %%xmm1, %%xmm3 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
115 "movaps %%xmm0, (%1,%0) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
116 "movaps %%xmm3, (%2,%0) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
117 "jg 1b \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
118 :"+r"(i)
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
119 :"r"(p), "r"(p + nloops), "r"(cptr)
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
120 );
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
121 p += nloops*2;
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
122 } while (--j);
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
123 cptr += nloops*2;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
124 nblocks >>= 1;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
125 nloops <<= 1;
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
126 } while (nblocks != 0);
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
127 }
968
64f1a11b5f86 added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents: 781
diff changeset
128
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
129 void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
130 const FFTSample *input, FFTSample *tmp)
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
131 {
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
132 long k, n8, n4, n2, n;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
133 const uint16_t *revtab = s->fft.revtab;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
134 const FFTSample *tcos = s->tcos;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
135 const FFTSample *tsin = s->tsin;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
136 const FFTSample *in1, *in2;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
137 FFTComplex *z = (FFTComplex *)tmp;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
138
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
139 n = 1 << s->nbits;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
140 n2 = n >> 1;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
141 n4 = n >> 2;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
142 n8 = n >> 3;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
143
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
144 asm volatile ("movaps %0, %%xmm7\n\t"::"m"(*p1m1p1m1));
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
145
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
146 /* pre rotation */
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
147 in1 = input;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
148 in2 = input + n2 - 4;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
149
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
150 /* Complex multiplication
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
151 Two complex products per iteration, we could have 4 with 8 xmm
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
152 registers, 8 with 16 xmm registers.
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
153 Maybe we should unroll more.
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
154 */
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
155 for (k = 0; k < n4; k += 2) {
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
156 asm volatile (
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
157 "movaps %0, %%xmm0 \n\t" // xmm0 = r0 X r1 X : in2
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
158 "movaps %1, %%xmm3 \n\t" // xmm3 = X i1 X i0: in1
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
159 "movlps %2, %%xmm1 \n\t" // xmm1 = X X R1 R0: tcos
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
160 "movlps %3, %%xmm2 \n\t" // xmm2 = X X I1 I0: tsin
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
161 "shufps $95, %%xmm0, %%xmm0 \n\t" // xmm0 = r1 r1 r0 r0
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
162 "shufps $160,%%xmm3, %%xmm3 \n\t" // xmm3 = i1 i1 i0 i0
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
163 "unpcklps %%xmm2, %%xmm1 \n\t" // xmm1 = I1 R1 I0 R0
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
164 "movaps %%xmm1, %%xmm2 \n\t" // xmm2 = I1 R1 I0 R0
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
165 "xorps %%xmm7, %%xmm2 \n\t" // xmm2 = -I1 R1 -I0 R0
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
166 "mulps %%xmm1, %%xmm0 \n\t" // xmm0 = rI rR rI rR
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
167 "shufps $177,%%xmm2, %%xmm2 \n\t" // xmm2 = R1 -I1 R0 -I0
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
168 "mulps %%xmm2, %%xmm3 \n\t" // xmm3 = Ri -Ii Ri -Ii
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
169 "addps %%xmm3, %%xmm0 \n\t" // xmm0 = result
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
170 ::"m"(in2[-2*k]), "m"(in1[2*k]),
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
171 "m"(tcos[k]), "m"(tsin[k])
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
172 );
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
173 /* Should be in the same block, hack for gcc2.95 & gcc3 */
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
174 asm (
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
175 "movlps %%xmm0, %0 \n\t"
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
176 "movhps %%xmm0, %1 \n\t"
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
177 :"=m"(z[revtab[k]]), "=m"(z[revtab[k + 1]])
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
178 );
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
179 }
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
180
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
181 ff_fft_calc_sse(&s->fft, z);
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
182
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
183 /* Not currently needed, added for safety */
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
184 asm volatile ("movaps %0, %%xmm7\n\t"::"m"(*p1m1p1m1));
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
185
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
186 /* post rotation + reordering */
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
187 for (k = 0; k < n4; k += 2) {
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
188 asm (
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
189 "movaps %0, %%xmm0 \n\t" // xmm0 = i1 r1 i0 r0: z
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
190 "movlps %1, %%xmm1 \n\t" // xmm1 = X X R1 R0: tcos
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
191 "movaps %%xmm0, %%xmm3 \n\t" // xmm3 = i1 r1 i0 r0
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
192 "movlps %2, %%xmm2 \n\t" // xmm2 = X X I1 I0: tsin
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
193 "shufps $160,%%xmm0, %%xmm0 \n\t" // xmm0 = r1 r1 r0 r0
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
194 "shufps $245,%%xmm3, %%xmm3 \n\t" // xmm3 = i1 i1 i0 i0
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
195 "unpcklps %%xmm2, %%xmm1 \n\t" // xmm1 = I1 R1 I0 R0
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
196 "movaps %%xmm1, %%xmm2 \n\t" // xmm2 = I1 R1 I0 R0
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
197 "xorps %%xmm7, %%xmm2 \n\t" // xmm2 = -I1 R1 -I0 R0
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
198 "mulps %%xmm1, %%xmm0 \n\t" // xmm0 = rI rR rI rR
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
199 "shufps $177,%%xmm2, %%xmm2 \n\t" // xmm2 = R1 -I1 R0 -I0
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
200 "mulps %%xmm2, %%xmm3 \n\t" // xmm3 = Ri -Ii Ri -Ii
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
201 "addps %%xmm3, %%xmm0 \n\t" // xmm0 = result
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
202 "movaps %%xmm0, %0 \n\t"
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
203 :"+m"(z[k])
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
204 :"m"(tcos[k]), "m"(tsin[k])
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
205 );
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
206 }
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
207
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
208 /*
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
209 Mnemonics:
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
210 0 = z[k].re
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
211 1 = z[k].im
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
212 2 = z[k + 1].re
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
213 3 = z[k + 1].im
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
214 4 = z[-k - 2].re
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
215 5 = z[-k - 2].im
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
216 6 = z[-k - 1].re
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
217 7 = z[-k - 1].im
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
218 */
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
219 k = 16-n;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
220 asm volatile("movaps %0, %%xmm7 \n\t"::"m"(*m1m1m1m1));
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
221 asm volatile(
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
222 "1: \n\t"
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
223 "movaps -16(%4,%0), %%xmm1 \n\t" // xmm1 = 4 5 6 7 = z[-2-k]
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
224 "neg %0 \n\t"
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
225 "movaps (%4,%0), %%xmm0 \n\t" // xmm0 = 0 1 2 3 = z[k]
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
226 "xorps %%xmm7, %%xmm0 \n\t" // xmm0 = -0 -1 -2 -3
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
227 "movaps %%xmm0, %%xmm2 \n\t" // xmm2 = -0 -1 -2 -3
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
228 "shufps $141,%%xmm1, %%xmm0 \n\t" // xmm0 = -1 -3 4 6
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
229 "shufps $216,%%xmm1, %%xmm2 \n\t" // xmm2 = -0 -2 5 7
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
230 "shufps $156,%%xmm0, %%xmm0 \n\t" // xmm0 = -1 6 -3 4 !
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
231 "shufps $156,%%xmm2, %%xmm2 \n\t" // xmm2 = -0 7 -2 5 !
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
232 "movaps %%xmm0, (%1,%0) \n\t" // output[2*k]
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
233 "movaps %%xmm2, (%2,%0) \n\t" // output[n2+2*k]
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
234 "neg %0 \n\t"
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
235 "shufps $27, %%xmm0, %%xmm0 \n\t" // xmm0 = 4 -3 6 -1
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
236 "xorps %%xmm7, %%xmm0 \n\t" // xmm0 = -4 3 -6 1 !
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
237 "shufps $27, %%xmm2, %%xmm2 \n\t" // xmm2 = 5 -2 7 -0 !
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
238 "movaps %%xmm0, -16(%2,%0) \n\t" // output[n2-4-2*k]
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
239 "movaps %%xmm2, -16(%3,%0) \n\t" // output[n-4-2*k]
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
240 "add $16, %0 \n\t"
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
241 "jle 1b \n\t"
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
242 :"+r"(k)
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
243 :"r"(output), "r"(output+n2), "r"(output+n), "r"(z+n8)
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
244 :"memory"
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
245 );
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
246 }
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
247