annotate i386/fft_sse.c @ 6693:6f13852a9161 libavcodec

Skip blocks in B-frames reuse the motion vectors from the next reference frame, so if the referenced block uses 16x8, 8x16 or 8x8 partitions, the skip block will have them too.
author kostya
date Sat, 26 Apr 2008 13:09:36 +0000
parents 524faa5eabd1
children 33896780c612
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
1 /*
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
2 * FFT/MDCT transform with SSE optimizations
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
3 * Copyright (c) 2002 Fabrice Bellard.
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
4 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3746
diff changeset
5 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3746
diff changeset
6 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3746
diff changeset
7 * FFmpeg is free software; you can redistribute it and/or
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
8 * modify it under the terms of the GNU Lesser General Public
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
9 * License as published by the Free Software Foundation; either
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3746
diff changeset
10 * version 2.1 of the License, or (at your option) any later version.
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
11 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3746
diff changeset
12 * FFmpeg is distributed in the hope that it will be useful,
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
15 * Lesser General Public License for more details.
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
16 *
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
17 * You should have received a copy of the GNU Lesser General Public
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3746
diff changeset
18 * License along with FFmpeg; if not, write to the Free Software
3036
0b546eab515d Update licensing information: The FSF changed postal address.
diego
parents: 2979
diff changeset
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
20 */
5010
d5ba514e3f4a Add libavcodec to compiler include flags in order to simplify header
diego
parents: 5000
diff changeset
21 #include "dsputil.h"
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
22
3166
ab1273ffe275 use xorps instead of mulps to toggle the sign of a float, as suggested by Software Optimization Guide for AMD64 Processors.
gpoirier
parents: 3036
diff changeset
23 static const int p1p1p1m1[4] __attribute__((aligned(16))) =
ab1273ffe275 use xorps instead of mulps to toggle the sign of a float, as suggested by Software Optimization Guide for AMD64 Processors.
gpoirier
parents: 3036
diff changeset
24 { 0, 0, 0, 1 << 31 };
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
25
3166
ab1273ffe275 use xorps instead of mulps to toggle the sign of a float, as suggested by Software Optimization Guide for AMD64 Processors.
gpoirier
parents: 3036
diff changeset
26 static const int p1p1m1p1[4] __attribute__((aligned(16))) =
ab1273ffe275 use xorps instead of mulps to toggle the sign of a float, as suggested by Software Optimization Guide for AMD64 Processors.
gpoirier
parents: 3036
diff changeset
27 { 0, 0, 1 << 31, 0 };
968
64f1a11b5f86 added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents: 781
diff changeset
28
3166
ab1273ffe275 use xorps instead of mulps to toggle the sign of a float, as suggested by Software Optimization Guide for AMD64 Processors.
gpoirier
parents: 3036
diff changeset
29 static const int p1p1m1m1[4] __attribute__((aligned(16))) =
ab1273ffe275 use xorps instead of mulps to toggle the sign of a float, as suggested by Software Optimization Guide for AMD64 Processors.
gpoirier
parents: 3036
diff changeset
30 { 0, 0, 1 << 31, 1 << 31 };
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
31
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
32 static const int p1m1p1m1[4] __attribute__((aligned(16))) =
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
33 { 0, 1 << 31, 0, 1 << 31 };
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
34
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
35 static const int m1m1m1m1[4] __attribute__((aligned(16))) =
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
36 { 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
37
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
38 #if 0
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
39 static void print_v4sf(const char *str, __m128 a)
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
40 {
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
41 float *p = (float *)&a;
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
42 printf("%s: %f %f %f %f\n",
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
43 str, p[0], p[1], p[2], p[3]);
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
44 }
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
45 #endif
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
46
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
47 /* XXX: handle reverse case */
1879
dd63cb7e5080 fft_*() renamed into ff_fft_*() patch by (Gildas Bazin <gbazin at altern dot org>)
michael
parents: 968
diff changeset
48 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
49 {
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
50 int ln = s->nbits;
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
51 long i, j;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
52 long nblocks, nloops;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
53 FFTComplex *p, *cptr;
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
54
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
55 asm volatile(
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
56 "movaps %0, %%xmm4 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
57 "movaps %1, %%xmm5 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
58 ::"m"(*p1p1m1m1),
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
59 "m"(*(s->inverse ? p1p1m1p1 : p1p1p1m1))
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
60 );
968
64f1a11b5f86 added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents: 781
diff changeset
61
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
62 i = 8 << ln;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
63 asm volatile(
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
64 "1: \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
65 "sub $32, %0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
66 /* do the pass 0 butterfly */
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
67 "movaps (%0,%1), %%xmm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
68 "movaps %%xmm0, %%xmm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
69 "shufps $0x4E, %%xmm0, %%xmm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
70 "xorps %%xmm4, %%xmm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
71 "addps %%xmm1, %%xmm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
72 "movaps 16(%0,%1), %%xmm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
73 "movaps %%xmm2, %%xmm3 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
74 "shufps $0x4E, %%xmm2, %%xmm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
75 "xorps %%xmm4, %%xmm3 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
76 "addps %%xmm3, %%xmm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
77 /* multiply third by -i */
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
78 /* by toggling the sign bit */
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
79 "shufps $0xB4, %%xmm2, %%xmm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
80 "xorps %%xmm5, %%xmm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
81 /* do the pass 1 butterfly */
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
82 "movaps %%xmm0, %%xmm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
83 "addps %%xmm2, %%xmm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
84 "subps %%xmm2, %%xmm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
85 "movaps %%xmm0, (%0,%1) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
86 "movaps %%xmm1, 16(%0,%1) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
87 "jg 1b \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
88 :"+r"(i)
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
89 :"r"(z)
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
90 );
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
91 /* pass 2 .. ln-1 */
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
92
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
93 nblocks = 1 << (ln-3);
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
94 nloops = 1 << 2;
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
95 cptr = s->exptab1;
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
96 do {
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
97 p = z;
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
98 j = nblocks;
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
99 do {
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
100 i = nloops*8;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
101 asm volatile(
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
102 "1: \n\t"
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
103 "sub $32, %0 \n\t"
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
104 "movaps (%2,%0), %%xmm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
105 "movaps (%1,%0), %%xmm0 \n\t"
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
106 "movaps 16(%2,%0), %%xmm5 \n\t"
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
107 "movaps 16(%1,%0), %%xmm4 \n\t"
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
108 "movaps %%xmm1, %%xmm2 \n\t"
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
109 "movaps %%xmm5, %%xmm6 \n\t"
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
110 "shufps $0xA0, %%xmm1, %%xmm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
111 "shufps $0xF5, %%xmm2, %%xmm2 \n\t"
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
112 "shufps $0xA0, %%xmm5, %%xmm5 \n\t"
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
113 "shufps $0xF5, %%xmm6, %%xmm6 \n\t"
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
114 "mulps (%3,%0,2), %%xmm1 \n\t" // cre*re cim*re
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
115 "mulps 16(%3,%0,2), %%xmm2 \n\t" // -cim*im cre*im
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
116 "mulps 32(%3,%0,2), %%xmm5 \n\t" // cre*re cim*re
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
117 "mulps 48(%3,%0,2), %%xmm6 \n\t" // -cim*im cre*im
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
118 "addps %%xmm2, %%xmm1 \n\t"
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
119 "addps %%xmm6, %%xmm5 \n\t"
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
120 "movaps %%xmm0, %%xmm3 \n\t"
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
121 "movaps %%xmm4, %%xmm7 \n\t"
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
122 "addps %%xmm1, %%xmm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
123 "subps %%xmm1, %%xmm3 \n\t"
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
124 "addps %%xmm5, %%xmm4 \n\t"
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
125 "subps %%xmm5, %%xmm7 \n\t"
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
126 "movaps %%xmm0, (%1,%0) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
127 "movaps %%xmm3, (%2,%0) \n\t"
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
128 "movaps %%xmm4, 16(%1,%0) \n\t"
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
129 "movaps %%xmm7, 16(%2,%0) \n\t"
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
130 "jg 1b \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
131 :"+r"(i)
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
132 :"r"(p), "r"(p + nloops), "r"(cptr)
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
133 );
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
134 p += nloops*2;
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
135 } while (--j);
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
136 cptr += nloops*2;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
137 nblocks >>= 1;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3166
diff changeset
138 nloops <<= 1;
781
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
139 } while (nblocks != 0);
6f5e87957bcb new generic FFT/MDCT code for audio codecs
bellard
parents:
diff changeset
140 }
968
64f1a11b5f86 added define for builtins use - inverse fix by Romain Dolbeau
bellard
parents: 781
diff changeset
141
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
142 void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
143 const FFTSample *input, FFTSample *tmp)
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
144 {
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
145 long k, n8, n4, n2, n;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
146 const uint16_t *revtab = s->fft.revtab;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
147 const FFTSample *tcos = s->tcos;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
148 const FFTSample *tsin = s->tsin;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
149 const FFTSample *in1, *in2;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
150 FFTComplex *z = (FFTComplex *)tmp;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
151
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
152 n = 1 << s->nbits;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
153 n2 = n >> 1;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
154 n4 = n >> 2;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
155 n8 = n >> 3;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
156
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
157 #ifdef ARCH_X86_64
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
158 asm volatile ("movaps %0, %%xmm8\n\t"::"m"(*p1m1p1m1));
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
159 #define P1M1P1M1 "%%xmm8"
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
160 #else
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
161 #define P1M1P1M1 "%4"
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
162 #endif
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
163
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
164 /* pre rotation */
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
165 in1 = input;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
166 in2 = input + n2 - 4;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
167
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
168 /* Complex multiplication */
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
169 for (k = 0; k < n4; k += 4) {
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
170 asm volatile (
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
171 "movaps %0, %%xmm0 \n\t" // xmm0 = r0 X r1 X : in2
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
172 "movaps %1, %%xmm3 \n\t" // xmm3 = X i1 X i0: in1
5117
524faa5eabd1 work around issues with the old version of Gnu Assembler shipped on
gpoirier
parents: 5010
diff changeset
173 "movaps -16+1*%0, %%xmm4 \n\t" // xmm4 = r0 X r1 X : in2
524faa5eabd1 work around issues with the old version of Gnu Assembler shipped on
gpoirier
parents: 5010
diff changeset
174 "movaps 16+1*%1, %%xmm7 \n\t" // xmm7 = X i1 X i0: in1
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
175 "movlps %2, %%xmm1 \n\t" // xmm1 = X X R1 R0: tcos
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
176 "movlps %3, %%xmm2 \n\t" // xmm2 = X X I1 I0: tsin
5117
524faa5eabd1 work around issues with the old version of Gnu Assembler shipped on
gpoirier
parents: 5010
diff changeset
177 "movlps 8+1*%2, %%xmm5 \n\t" // xmm5 = X X R1 R0: tcos
524faa5eabd1 work around issues with the old version of Gnu Assembler shipped on
gpoirier
parents: 5010
diff changeset
178 "movlps 8+1*%3, %%xmm6 \n\t" // xmm6 = X X I1 I0: tsin
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
179 "shufps $95, %%xmm0, %%xmm0 \n\t" // xmm0 = r1 r1 r0 r0
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
180 "shufps $160,%%xmm3, %%xmm3 \n\t" // xmm3 = i1 i1 i0 i0
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
181 "shufps $95, %%xmm4, %%xmm4 \n\t" // xmm4 = r1 r1 r0 r0
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
182 "shufps $160,%%xmm7, %%xmm7 \n\t" // xmm7 = i1 i1 i0 i0
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
183 "unpcklps %%xmm2, %%xmm1 \n\t" // xmm1 = I1 R1 I0 R0
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
184 "unpcklps %%xmm6, %%xmm5 \n\t" // xmm5 = I1 R1 I0 R0
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
185 "movaps %%xmm1, %%xmm2 \n\t" // xmm2 = I1 R1 I0 R0
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
186 "movaps %%xmm5, %%xmm6 \n\t" // xmm6 = I1 R1 I0 R0
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
187 "xorps "P1M1P1M1", %%xmm2 \n\t" // xmm2 = -I1 R1 -I0 R0
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
188 "xorps "P1M1P1M1", %%xmm6 \n\t" // xmm6 = -I1 R1 -I0 R0
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
189 "mulps %%xmm1, %%xmm0 \n\t" // xmm0 = rI rR rI rR
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
190 "mulps %%xmm5, %%xmm4 \n\t" // xmm4 = rI rR rI rR
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
191 "shufps $177,%%xmm2, %%xmm2 \n\t" // xmm2 = R1 -I1 R0 -I0
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
192 "shufps $177,%%xmm6, %%xmm6 \n\t" // xmm6 = R1 -I1 R0 -I0
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
193 "mulps %%xmm2, %%xmm3 \n\t" // xmm3 = Ri -Ii Ri -Ii
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
194 "mulps %%xmm6, %%xmm7 \n\t" // xmm7 = Ri -Ii Ri -Ii
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
195 "addps %%xmm3, %%xmm0 \n\t" // xmm0 = result
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
196 "addps %%xmm7, %%xmm4 \n\t" // xmm4 = result
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
197 ::"m"(in2[-2*k]), "m"(in1[2*k]),
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
198 "m"(tcos[k]), "m"(tsin[k])
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
199 #ifndef ARCH_X86_64
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
200 ,"m"(*p1m1p1m1)
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
201 #endif
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
202 );
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
203 /* Should be in the same block, hack for gcc2.95 & gcc3 */
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
204 asm (
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
205 "movlps %%xmm0, %0 \n\t"
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
206 "movhps %%xmm0, %1 \n\t"
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
207 "movlps %%xmm4, %2 \n\t"
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
208 "movhps %%xmm4, %3 \n\t"
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
209 :"=m"(z[revtab[k]]), "=m"(z[revtab[k + 1]]),
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
210 "=m"(z[revtab[k + 2]]), "=m"(z[revtab[k + 3]])
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
211 );
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
212 }
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
213
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
214 ff_fft_calc_sse(&s->fft, z);
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
215
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
216 #ifndef ARCH_X86_64
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
217 #undef P1M1P1M1
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
218 #define P1M1P1M1 "%3"
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
219 #endif
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
220
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
221 /* post rotation + reordering */
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
222 for (k = 0; k < n4; k += 4) {
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
223 asm (
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
224 "movaps %0, %%xmm0 \n\t" // xmm0 = i1 r1 i0 r0: z
5117
524faa5eabd1 work around issues with the old version of Gnu Assembler shipped on
gpoirier
parents: 5010
diff changeset
225 "movaps 16+1*%0, %%xmm4 \n\t" // xmm4 = i1 r1 i0 r0: z
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
226 "movlps %1, %%xmm1 \n\t" // xmm1 = X X R1 R0: tcos
5117
524faa5eabd1 work around issues with the old version of Gnu Assembler shipped on
gpoirier
parents: 5010
diff changeset
227 "movlps 8+1*%1, %%xmm5 \n\t" // xmm5 = X X R1 R0: tcos
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
228 "movaps %%xmm0, %%xmm3 \n\t" // xmm3 = i1 r1 i0 r0
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
229 "movaps %%xmm4, %%xmm7 \n\t" // xmm7 = i1 r1 i0 r0
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
230 "movlps %2, %%xmm2 \n\t" // xmm2 = X X I1 I0: tsin
5117
524faa5eabd1 work around issues with the old version of Gnu Assembler shipped on
gpoirier
parents: 5010
diff changeset
231 "movlps 8+1*%2, %%xmm6 \n\t" // xmm6 = X X I1 I0: tsin
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
232 "shufps $160,%%xmm0, %%xmm0 \n\t" // xmm0 = r1 r1 r0 r0
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
233 "shufps $245,%%xmm3, %%xmm3 \n\t" // xmm3 = i1 i1 i0 i0
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
234 "shufps $160,%%xmm4, %%xmm4 \n\t" // xmm4 = r1 r1 r0 r0
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
235 "shufps $245,%%xmm7, %%xmm7 \n\t" // xmm7 = i1 i1 i0 i0
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
236 "unpcklps %%xmm2, %%xmm1 \n\t" // xmm1 = I1 R1 I0 R0
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
237 "unpcklps %%xmm6, %%xmm5 \n\t" // xmm5 = I1 R1 I0 R0
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
238 "movaps %%xmm1, %%xmm2 \n\t" // xmm2 = I1 R1 I0 R0
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
239 "movaps %%xmm5, %%xmm6 \n\t" // xmm6 = I1 R1 I0 R0
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
240 "xorps "P1M1P1M1", %%xmm2 \n\t" // xmm2 = -I1 R1 -I0 R0
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
241 "mulps %%xmm1, %%xmm0 \n\t" // xmm0 = rI rR rI rR
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
242 "xorps "P1M1P1M1", %%xmm6 \n\t" // xmm6 = -I1 R1 -I0 R0
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
243 "mulps %%xmm5, %%xmm4 \n\t" // xmm4 = rI rR rI rR
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
244 "shufps $177,%%xmm2, %%xmm2 \n\t" // xmm2 = R1 -I1 R0 -I0
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
245 "shufps $177,%%xmm6, %%xmm6 \n\t" // xmm6 = R1 -I1 R0 -I0
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
246 "mulps %%xmm2, %%xmm3 \n\t" // xmm3 = Ri -Ii Ri -Ii
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
247 "mulps %%xmm6, %%xmm7 \n\t" // xmm7 = Ri -Ii Ri -Ii
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
248 "addps %%xmm3, %%xmm0 \n\t" // xmm0 = result
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
249 "addps %%xmm7, %%xmm4 \n\t" // xmm4 = result
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
250 "movaps %%xmm0, %0 \n\t"
5117
524faa5eabd1 work around issues with the old version of Gnu Assembler shipped on
gpoirier
parents: 5010
diff changeset
251 "movaps %%xmm4, 16+1*%0\n\t"
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
252 :"+m"(z[k])
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
253 :"m"(tcos[k]), "m"(tsin[k])
5000
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
254 #ifndef ARCH_X86_64
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
255 ,"m"(*p1m1p1m1)
743a8b12b7de Faster SSE FFT/MDCT, patch by Zuxy Meng %zuxy P meng A gmail P com%
gpoirier
parents: 3947
diff changeset
256 #endif
3746
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
257 );
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
258 }
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
259
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
260 /*
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
261 Mnemonics:
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
262 0 = z[k].re
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
263 1 = z[k].im
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
264 2 = z[k + 1].re
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
265 3 = z[k + 1].im
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
266 4 = z[-k - 2].re
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
267 5 = z[-k - 2].im
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
268 6 = z[-k - 1].re
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
269 7 = z[-k - 1].im
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
270 */
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
271 k = 16-n;
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
272 asm volatile("movaps %0, %%xmm7 \n\t"::"m"(*m1m1m1m1));
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
273 asm volatile(
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
274 "1: \n\t"
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
275 "movaps -16(%4,%0), %%xmm1 \n\t" // xmm1 = 4 5 6 7 = z[-2-k]
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
276 "neg %0 \n\t"
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
277 "movaps (%4,%0), %%xmm0 \n\t" // xmm0 = 0 1 2 3 = z[k]
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
278 "xorps %%xmm7, %%xmm0 \n\t" // xmm0 = -0 -1 -2 -3
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
279 "movaps %%xmm0, %%xmm2 \n\t" // xmm2 = -0 -1 -2 -3
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
280 "shufps $141,%%xmm1, %%xmm0 \n\t" // xmm0 = -1 -3 4 6
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
281 "shufps $216,%%xmm1, %%xmm2 \n\t" // xmm2 = -0 -2 5 7
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
282 "shufps $156,%%xmm0, %%xmm0 \n\t" // xmm0 = -1 6 -3 4 !
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
283 "shufps $156,%%xmm2, %%xmm2 \n\t" // xmm2 = -0 7 -2 5 !
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
284 "movaps %%xmm0, (%1,%0) \n\t" // output[2*k]
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
285 "movaps %%xmm2, (%2,%0) \n\t" // output[n2+2*k]
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
286 "neg %0 \n\t"
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
287 "shufps $27, %%xmm0, %%xmm0 \n\t" // xmm0 = 4 -3 6 -1
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
288 "xorps %%xmm7, %%xmm0 \n\t" // xmm0 = -4 3 -6 1 !
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
289 "shufps $27, %%xmm2, %%xmm2 \n\t" // xmm2 = 5 -2 7 -0 !
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
290 "movaps %%xmm0, -16(%2,%0) \n\t" // output[n2-4-2*k]
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
291 "movaps %%xmm2, -16(%3,%0) \n\t" // output[n-4-2*k]
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
292 "add $16, %0 \n\t"
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
293 "jle 1b \n\t"
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
294 :"+r"(k)
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
295 :"r"(output), "r"(output+n2), "r"(output+n), "r"(z+n8)
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
296 :"memory"
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
297 );
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
298 }
2ec498208c6a sse implementation of imdct.
lorenm
parents: 3590
diff changeset
299