annotate liba52/imdct.c @ 8763:19e96e60a3d0

Speed optimizations (runs twice as fast) and bugfixes (wrong cutoff frequency, buffer overrun, noise and garbled output with the wrong input format)
author anders
date Sat, 04 Jan 2003 06:19:25 +0000
parents fb88ccbc5ccc
children 01a9cf43074c
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1 /*
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
2 * imdct.c
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
3 * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
5 *
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
6 * This file is part of a52dec, a free ATSC A-52 stream decoder.
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
7 * See http://liba52.sourceforge.net/ for updates.
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
8 *
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
9 * a52dec is free software; you can redistribute it and/or modify
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
10 * it under the terms of the GNU General Public License as published by
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
11 * the Free Software Foundation; either version 2 of the License, or
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
12 * (at your option) any later version.
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
13 *
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
14 * a52dec is distributed in the hope that it will be useful,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
17 * GNU General Public License for more details.
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
18 *
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
19 * You should have received a copy of the GNU General Public License
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
20 * along with this program; if not, write to the Free Software
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
22 *
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
23 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
24 * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
25 * michael did port them from libac3 (untested, perhaps totally broken)
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
26 */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
27
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
28 #include "config.h"
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
29
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
30 #include <math.h>
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
31 #include <stdio.h>
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
32 #ifndef M_PI
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
33 #define M_PI 3.1415926535897932384626433832795029
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
34 #endif
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
35 #include <inttypes.h>
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
36
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
37 #include "a52.h"
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
38 #include "a52_internal.h"
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
39 #include "mm_accel.h"
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
40 #include "mangle.h"
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
41
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
42 #ifdef RUNTIME_CPUDETECT
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
43 #undef HAVE_3DNOWEX
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
44 #endif
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
45
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
46 #define USE_AC3_C
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
47
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
48 void (* imdct_256) (sample_t data[], sample_t delay[], sample_t bias);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
49 void (* imdct_512) (sample_t data[], sample_t delay[], sample_t bias);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
50
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
51 typedef struct complex_s {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
52 sample_t real;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
53 sample_t imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
54 } complex_t;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
55
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
56 static void fft_128p(complex_t *a);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
57
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
/*
 * 128-entry index table for the fft_128p() code path.  When USE_AC3_C is
 * defined, imdct_do_512() fills buf[i] from input position j = pm128[i]
 * during the pre-IFFT multiply, so the data is already reordered when
 * fft_128p() runs.  NOTE(review): appears to be a permutation of 0..127
 * -- confirm against fft_128p().
 */
58 static const int pm128[128] __attribute__((aligned(16))) =
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
59 {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
60 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
61 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
62 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
63 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
64 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
65 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
66 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
67 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
68 };
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
69
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
70 /* 128 point bit-reverse LUT */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
71 static uint8_t bit_reverse_512[] = {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
72 0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
73 0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
74 0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
75 0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
76 0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
77 0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
78 0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
79 0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
80 0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
81 0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
82 0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
83 0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
84 0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
85 0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
86 0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
87 0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
88
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
/* 64 point bit-reverse LUT: entry i holds i with its 6 low bits reversed */
89 static uint8_t bit_reverse_256[] = {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
90 0x00, 0x20, 0x10, 0x30, 0x08, 0x28, 0x18, 0x38,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
91 0x04, 0x24, 0x14, 0x34, 0x0c, 0x2c, 0x1c, 0x3c,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
92 0x02, 0x22, 0x12, 0x32, 0x0a, 0x2a, 0x1a, 0x3a,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
93 0x06, 0x26, 0x16, 0x36, 0x0e, 0x2e, 0x1e, 0x3e,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
94 0x01, 0x21, 0x11, 0x31, 0x09, 0x29, 0x19, 0x39,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
95 0x05, 0x25, 0x15, 0x35, 0x0d, 0x2d, 0x1d, 0x3d,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
96 0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
97 0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f};
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
98
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
99 #ifdef ARCH_X86
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
100 // NOTE: SSE needs 16byte alignment or it will segfault
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
101 //
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
102 static complex_t __attribute__((aligned(16))) buf[128];
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
103 static float __attribute__((aligned(16))) sseSinCos1c[256];
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
104 static float __attribute__((aligned(16))) sseSinCos1d[256];
3512
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
105 static float __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
106 //static float __attribute__((aligned(16))) sseW0[4];
3483390a902b sse opt
michael
parents: 3529
diff changeset
107 static float __attribute__((aligned(16))) sseW1[8];
3483390a902b sse opt
michael
parents: 3529
diff changeset
108 static float __attribute__((aligned(16))) sseW2[16];
3483390a902b sse opt
michael
parents: 3529
diff changeset
109 static float __attribute__((aligned(16))) sseW3[32];
3483390a902b sse opt
michael
parents: 3529
diff changeset
110 static float __attribute__((aligned(16))) sseW4[64];
3483390a902b sse opt
michael
parents: 3529
diff changeset
111 static float __attribute__((aligned(16))) sseW5[128];
3483390a902b sse opt
michael
parents: 3529
diff changeset
112 static float __attribute__((aligned(16))) sseW6[256];
3483390a902b sse opt
michael
parents: 3529
diff changeset
113 static float __attribute__((aligned(16))) *sseW[7]=
3483390a902b sse opt
michael
parents: 3529
diff changeset
114 {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
115 static float __attribute__((aligned(16))) sseWindow[512];
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
116 #else
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
117 static complex_t buf[128];
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
118 #endif
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
119
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
120 /* Twiddle factor LUT */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
121 static complex_t w_1[1];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
122 static complex_t w_2[2];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
123 static complex_t w_4[4];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
124 static complex_t w_8[8];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
125 static complex_t w_16[16];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
126 static complex_t w_32[32];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
127 static complex_t w_64[64];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
128 static complex_t * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
129
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
130 /* Twiddle factors for IMDCT */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
131 static sample_t xcos1[128];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
132 static sample_t xsin1[128];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
133 static sample_t xcos2[64];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
134 static sample_t xsin2[64];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
135
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
136 /* Windowing function for Modified DCT - Thank you acroread */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
137 sample_t imdct_window[] = {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
138 0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
139 0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
140 0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
141 0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
142 0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
143 0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
144 0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
145 0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
146 0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
147 0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
148 0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
149 0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
150 0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
151 0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
152 0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
153 0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
154 0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
155 0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
156 0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
157 0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
158 0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
159 0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
160 0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
161 0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
162 0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
163 0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
164 0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
165 0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
166 0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
167 0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
168 0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
169 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000 };
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
170
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
171
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
/*
 * Exchange the two complex values that a and b point to.
 * Both the real and imaginary members are swapped, via a temporary copy
 * of the whole struct.
 */
172 static inline void swap_cmplx(complex_t *a, complex_t *b)
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
173 {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
174 complex_t tmp;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
175
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
176 tmp = *a;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
177 *a = *b;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
178 *b = tmp;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
179 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
180
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
181
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
182
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
/*
 * Return the complex product a * b.
 *   result.real = a.real * b.real - a.imag * b.imag
 *   result.imag = a.real * b.imag + a.imag * b.real
 * Both operands are taken by value and the result is returned by value;
 * neither argument is modified.
 */
183 static inline complex_t cmplx_mult(complex_t a, complex_t b)
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
184 {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
185 complex_t ret;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
186
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
187 ret.real = a.real * b.real - a.imag * b.imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
188 ret.imag = a.real * b.imag + a.imag * b.real;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
189
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
190 return ret;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
191 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
192
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
193 void
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
194 imdct_do_512(sample_t data[],sample_t delay[], sample_t bias)
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
195 {
8254
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
196 int i;
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
197 #ifndef USE_AC3_C
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
198 int k;
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
199 int p,q;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
200 int m;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
201 int two_m;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
202 int two_m_plus_one;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
203
8254
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
204 sample_t tmp_b_i;
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
205 sample_t tmp_b_r;
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
206 #endif
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
207 sample_t tmp_a_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
208 sample_t tmp_a_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
209
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
210 sample_t *data_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
211 sample_t *delay_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
212 sample_t *window_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
213
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
214 /* 512 IMDCT with source and dest data in 'data' */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
215
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
216 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
217 for( i=0; i < 128; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
218 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
219 #ifdef USE_AC3_C
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
220 int j= pm128[i];
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
221 #else
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
222 int j= bit_reverse_512[i];
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
223 #endif
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
224 buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
225 buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
226 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
227
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
228 /* FFT Merge */
3549
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
229 /* unoptimized variant
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
230 for (m=1; m < 7; m++) {
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
231 if(m)
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
232 two_m = (1 << m);
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
233 else
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
234 two_m = 1;
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
235
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
236 two_m_plus_one = (1 << (m+1));
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
237
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
238 for(i = 0; i < 128; i += two_m_plus_one) {
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
239 for(k = 0; k < two_m; k++) {
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
240 p = k + i;
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
241 q = p + two_m;
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
242 tmp_a_r = buf[p].real;
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
243 tmp_a_i = buf[p].imag;
3549
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
244 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
245 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
246 buf[p].real = tmp_a_r + tmp_b_r;
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
247 buf[p].imag = tmp_a_i + tmp_b_i;
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
248 buf[q].real = tmp_a_r - tmp_b_r;
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
249 buf[q].imag = tmp_a_i - tmp_b_i;
3549
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
250 }
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
251 }
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
252 }
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
253 */
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
254 #ifdef USE_AC3_C
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
255 fft_128p (&buf[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
256 #else
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
257
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
258 /* 1. iteration */
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
259 for(i = 0; i < 128; i += 2) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
260 tmp_a_r = buf[i].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
261 tmp_a_i = buf[i].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
262 tmp_b_r = buf[i+1].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
263 tmp_b_i = buf[i+1].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
264 buf[i].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
265 buf[i].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
266 buf[i+1].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
267 buf[i+1].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
268 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
269
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
270 /* 2. iteration */
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
271 // Note w[1]={{1,0}, {0,-1}}
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
272 for(i = 0; i < 128; i += 4) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
273 tmp_a_r = buf[i].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
274 tmp_a_i = buf[i].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
275 tmp_b_r = buf[i+2].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
276 tmp_b_i = buf[i+2].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
277 buf[i].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
278 buf[i].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
279 buf[i+2].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
280 buf[i+2].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
281 tmp_a_r = buf[i+1].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
282 tmp_a_i = buf[i+1].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
283 tmp_b_r = buf[i+3].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
284 tmp_b_i = buf[i+3].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
285 buf[i+1].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
286 buf[i+1].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
287 buf[i+3].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
288 buf[i+3].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
289 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
290
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
291 /* 3. iteration */
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
292 for(i = 0; i < 128; i += 8) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
293 tmp_a_r = buf[i].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
294 tmp_a_i = buf[i].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
295 tmp_b_r = buf[i+4].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
296 tmp_b_i = buf[i+4].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
297 buf[i].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
298 buf[i].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
299 buf[i+4].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
300 buf[i+4].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
301 tmp_a_r = buf[1+i].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
302 tmp_a_i = buf[1+i].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
303 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
304 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
305 buf[1+i].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
306 buf[1+i].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
307 buf[i+5].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
308 buf[i+5].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
309 tmp_a_r = buf[i+2].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
310 tmp_a_i = buf[i+2].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
311 tmp_b_r = buf[i+6].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
312 tmp_b_i = - buf[i+6].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
313 buf[i+2].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
314 buf[i+2].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
315 buf[i+6].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
316 buf[i+6].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
317 tmp_a_r = buf[i+3].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
318 tmp_a_i = buf[i+3].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
319 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
320 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
321 buf[i+3].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
322 buf[i+3].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
323 buf[i+7].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
324 buf[i+7].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
325 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
326
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
327 /* 4-7. iterations */
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
328 for (m=3; m < 7; m++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
329 two_m = (1 << m);
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
330
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
331 two_m_plus_one = two_m<<1;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
332
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
333 for(i = 0; i < 128; i += two_m_plus_one) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
334 for(k = 0; k < two_m; k++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
335 int p = k + i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
336 int q = p + two_m;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
337 tmp_a_r = buf[p].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
338 tmp_a_i = buf[p].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
339 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
340 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
341 buf[p].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
342 buf[p].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
343 buf[q].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
344 buf[q].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
345 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
346 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
347 }
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
348 #endif
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
349 /* Post IFFT complex multiply plus IFFT complex conjugate*/
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
350 for( i=0; i < 128; i++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
351 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
352 tmp_a_r = buf[i].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
353 tmp_a_i = -1.0 * buf[i].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
354 buf[i].real =(tmp_a_r * xcos1[i]) - (tmp_a_i * xsin1[i]);
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
355 buf[i].imag =(tmp_a_r * xsin1[i]) + (tmp_a_i * xcos1[i]);
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
356 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
357
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
358 data_ptr = data;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
359 delay_ptr = delay;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
360 window_ptr = imdct_window;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
361
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
362 /* Window and convert to real valued signal */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
363 for(i=0; i< 64; i++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
364 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
365 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
366 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
367
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
368 for(i=0; i< 64; i++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
369 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
370 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
371 }
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
372
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
373 /* The trailing edge of the window goes into the delay line */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
374 delay_ptr = delay;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
375
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
376 for(i=0; i< 64; i++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
377 *delay_ptr++ = -buf[64+i].real * *--window_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
378 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
379 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
380
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
381 for(i=0; i<64; i++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
382 *delay_ptr++ = buf[i].imag * *--window_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
383 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
384 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
385 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
386
4497
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
387 // Stuff below this line is borrowed from libac3
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
388 #include "srfftp.h"
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
389 #ifdef ARCH_X86 /* x86-only section starts here */
4497
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
390 #ifndef HAVE_3DNOW /* make sure HAVE_3DNOW is defined before the 3DNow! headers below are included */
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
391 #define HAVE_3DNOW 1
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
392 #endif
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
393 #include "srfftp_3dnow.h"
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
394
8451
fb88ccbc5ccc compiler warning fixes
arpi
parents: 8254
diff changeset
395 const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }}; /* 8-byte aligned; second word has only bit 31 set. NOTE(review): presumably an XOR mask that flips the float sign of the second component -- confirm against the 3DNow! code */
fb88ccbc5ccc compiler warning fixes
arpi
parents: 8254
diff changeset
396 const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }}; /* 8-byte aligned; first word has only bit 31 set. NOTE(review): presumably the mirror-image sign mask for the first component -- confirm */
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
397 const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 }; /* both initializers are ~sqrt(2)/2; 8-byte aligned */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
398
4497
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
399 #undef HAVE_3DNOWEX /* first inclusion of imdct_3dnow.h happens with HAVE_3DNOWEX undefined */
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
400 #include "imdct_3dnow.h"
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
401 #define HAVE_3DNOWEX /* second inclusion happens with HAVE_3DNOWEX defined. NOTE(review): presumably the header emits a separately named 3DNowEx variant on this pass -- confirm in imdct_3dnow.h */
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
402 #include "imdct_3dnow.h"
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
403
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
404 void
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
405 imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
406 {
8254
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
407 /* int i,k;
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
408 int p,q;*/
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
409 int m;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
410 int two_m;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
411 int two_m_plus_one;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
412
8254
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
413 /* sample_t tmp_a_i;
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
414 sample_t tmp_a_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
415 sample_t tmp_b_i;
8254
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
416 sample_t tmp_b_r;*/
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
417
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
418 sample_t *data_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
419 sample_t *delay_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
420 sample_t *window_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
421
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
422 /* 512 IMDCT with source and dest data in 'data' */
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
423 /* see the C version (dct_do_512()); it is almost identical, just in C */
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
424
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
425 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
426 /* Bit reversed shuffling */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
427 asm volatile(
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
428 "xorl %%esi, %%esi \n\t"
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
429 "leal "MANGLE(bit_reverse_512)", %%eax \n\t"
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
430 "movl $1008, %%edi \n\t"
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
431 "pushl %%ebp \n\t" //use ebp without telling gcc
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
432 ".balign 16 \n\t"
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
433 "1: \n\t"
3584
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
434 "movlps (%0, %%esi), %%xmm0 \n\t" // XXXI
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
435 "movhps 8(%0, %%edi), %%xmm0 \n\t" // RXXI
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
436 "movlps 8(%0, %%esi), %%xmm1 \n\t" // XXXi
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
437 "movhps (%0, %%edi), %%xmm1 \n\t" // rXXi
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
438 "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
439 "movaps "MANGLE(sseSinCos1c)"(%%esi), %%xmm2\n\t"
3584
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
440 "mulps %%xmm0, %%xmm2 \n\t"
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
441 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
442 "mulps "MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t"
3584
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
443 "subps %%xmm0, %%xmm2 \n\t"
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
444 "movzbl (%%eax), %%edx \n\t"
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
445 "movzbl 1(%%eax), %%ebp \n\t"
3584
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
446 "movlps %%xmm2, (%1, %%edx,8) \n\t"
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
447 "movhps %%xmm2, (%1, %%ebp,8) \n\t"
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
448 "addl $16, %%esi \n\t"
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
449 "addl $2, %%eax \n\t" // avoid complex addressing for P4 crap
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
450 "subl $16, %%edi \n\t"
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
451 " jnc 1b \n\t"
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
452 "popl %%ebp \n\t"//no we didnt touch ebp *g*
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
453 :: "b" (data), "c" (buf)
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
454 : "%esi", "%edi", "%eax", "%edx"
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
455 );
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
456
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
457
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
458 /* FFT Merge */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
459 /* unoptimized variant
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
460 for (m=1; m < 7; m++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
461 if(m)
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
462 two_m = (1 << m);
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
463 else
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
464 two_m = 1;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
465
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
466 two_m_plus_one = (1 << (m+1));
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
467
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
468 for(i = 0; i < 128; i += two_m_plus_one) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
469 for(k = 0; k < two_m; k++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
470 p = k + i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
471 q = p + two_m;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
472 tmp_a_r = buf[p].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
473 tmp_a_i = buf[p].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
474 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
475 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
476 buf[p].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
477 buf[p].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
478 buf[q].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
479 buf[q].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
480 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
481 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
482 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
483 */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
484
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
485 /* 1. iteration */
3549
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
486 // Note w[0][0]={1,0}
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
487 asm volatile(
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
488 "xorps %%xmm1, %%xmm1 \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
489 "xorps %%xmm2, %%xmm2 \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
490 "movl %0, %%esi \n\t"
3529
a86166b495a6 sse opt
michael
parents: 3527
diff changeset
491 ".balign 16 \n\t"
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
492 "1: \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
493 "movlps (%%esi), %%xmm0 \n\t" //buf[p]
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
494 "movlps 8(%%esi), %%xmm1\n\t" //buf[q]
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
495 "movhps (%%esi), %%xmm0 \n\t" //buf[p]
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
496 "movhps 8(%%esi), %%xmm2\n\t" //buf[q]
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
497 "addps %%xmm1, %%xmm0 \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
498 "subps %%xmm2, %%xmm0 \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
499 "movaps %%xmm0, (%%esi) \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
500 "addl $16, %%esi \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
501 "cmpl %1, %%esi \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
502 " jb 1b \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
503 :: "g" (buf), "r" (buf + 128)
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
504 : "%esi"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
505 );
3549
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
506
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
507 /* 2. iteration */
3512
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
508 // Note w[1]={{1,0}, {0,-1}}
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
509 asm volatile(
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
510 "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
3512
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
511 "movl %0, %%esi \n\t"
3529
a86166b495a6 sse opt
michael
parents: 3527
diff changeset
512 ".balign 16 \n\t"
3512
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
513 "1: \n\t"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
514 "movaps 16(%%esi), %%xmm2 \n\t" //r2,i2,r3,i3
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
515 "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
516 "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
517 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
518 "movaps (%%esi), %%xmm1 \n\t" //r0,i0,r1,i1
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
519 "addps %%xmm2, %%xmm0 \n\t"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
520 "subps %%xmm2, %%xmm1 \n\t"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
521 "movaps %%xmm0, (%%esi) \n\t"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
522 "movaps %%xmm1, 16(%%esi) \n\t"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
523 "addl $32, %%esi \n\t"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
524 "cmpl %1, %%esi \n\t"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
525 " jb 1b \n\t"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
526 :: "g" (buf), "r" (buf + 128)
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
527 : "%esi"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
528 );
3549
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
529
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
530 /* 3. iteration */
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
531 /*
3483390a902b sse opt
michael
parents: 3529
diff changeset
532 Note sseW2+0={1,1,sqrt(2),sqrt(2)}
3483390a902b sse opt
michael
parents: 3529
diff changeset
533 Note sseW2+16={0,0,sqrt(2),-sqrt(2)}
3483390a902b sse opt
michael
parents: 3529
diff changeset
534 Note sseW2+32={0,0,-sqrt(2),-sqrt(2)}
3483390a902b sse opt
michael
parents: 3529
diff changeset
535 Note sseW2+48={1,-1,sqrt(2),-sqrt(2)}
3483390a902b sse opt
michael
parents: 3529
diff changeset
536 */
3483390a902b sse opt
michael
parents: 3529
diff changeset
537 asm volatile(
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
538 "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
539 "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
540 "xorps %%xmm5, %%xmm5 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
541 "xorps %%xmm2, %%xmm2 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
542 "movl %0, %%esi \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
543 ".balign 16 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
544 "1: \n\t"
3537
d7e5a32643c9 C optimizations
michael
parents: 3534
diff changeset
545 "movaps 32(%%esi), %%xmm2 \n\t" //r4,i4,r5,i5
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
546 "movaps 48(%%esi), %%xmm3 \n\t" //r6,i6,r7,i7
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
547 "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
548 "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7
3537
d7e5a32643c9 C optimizations
michael
parents: 3534
diff changeset
549 "mulps %%xmm2, %%xmm4 \n\t"
d7e5a32643c9 C optimizations
michael
parents: 3534
diff changeset
550 "mulps %%xmm3, %%xmm5 \n\t"
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
551 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5
3483390a902b sse opt
michael
parents: 3529
diff changeset
552 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7
3537
d7e5a32643c9 C optimizations
michael
parents: 3534
diff changeset
553 "mulps %%xmm6, %%xmm3 \n\t"
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
554 "mulps %%xmm7, %%xmm2 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
555 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1
3483390a902b sse opt
michael
parents: 3529
diff changeset
556 "movaps 16(%%esi), %%xmm1 \n\t" //r2,i2,r3,i3
3483390a902b sse opt
michael
parents: 3529
diff changeset
557 "addps %%xmm4, %%xmm2 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
558 "addps %%xmm5, %%xmm3 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
559 "movaps %%xmm2, %%xmm4 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
560 "movaps %%xmm3, %%xmm5 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
561 "addps %%xmm0, %%xmm2 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
562 "addps %%xmm1, %%xmm3 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
563 "subps %%xmm4, %%xmm0 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
564 "subps %%xmm5, %%xmm1 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
565 "movaps %%xmm2, (%%esi) \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
566 "movaps %%xmm3, 16(%%esi) \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
567 "movaps %%xmm0, 32(%%esi) \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
568 "movaps %%xmm1, 48(%%esi) \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
569 "addl $64, %%esi \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
570 "cmpl %1, %%esi \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
571 " jb 1b \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
572 :: "g" (buf), "r" (buf + 128)
3483390a902b sse opt
michael
parents: 3529
diff changeset
573 : "%esi"
3483390a902b sse opt
michael
parents: 3529
diff changeset
574 );
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
575
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
576 /* 4-7. iterations */
3546
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
577 for (m=3; m < 7; m++) {
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
578 two_m = (1 << m);
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
579 two_m_plus_one = two_m<<1;
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
580 asm volatile(
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
581 "movl %0, %%esi \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
582 ".balign 16 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
583 "1: \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
584 "xorl %%edi, %%edi \n\t" // k
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
585 "leal (%%esi, %3), %%edx \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
586 "2: \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
587 "movaps (%%edx, %%edi), %%xmm1 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
588 "movaps (%4, %%edi, 2), %%xmm2 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
589 "mulps %%xmm1, %%xmm2 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
590 "shufps $0xB1, %%xmm1, %%xmm1 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
591 "mulps 16(%4, %%edi, 2), %%xmm1 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
592 "movaps (%%esi, %%edi), %%xmm0 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
593 "addps %%xmm2, %%xmm1 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
594 "movaps %%xmm1, %%xmm2 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
595 "addps %%xmm0, %%xmm1 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
596 "subps %%xmm2, %%xmm0 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
597 "movaps %%xmm1, (%%esi, %%edi) \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
598 "movaps %%xmm0, (%%edx, %%edi) \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
599 "addl $16, %%edi \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
600 "cmpl %3, %%edi \n\t" //FIXME (opt) count against 0
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
601 " jb 2b \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
602 "addl %2, %%esi \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
603 "cmpl %1, %%esi \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
604 " jb 1b \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
605 :: "g" (buf), "m" (buf+128), "m" (two_m_plus_one<<3), "r" (two_m<<3),
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
606 "r" (sseW[m])
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
607 : "%esi", "%edi", "%edx"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
608 );
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
609 }
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
610
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
611 /* Post IFFT complex multiply plus IFFT complex conjugate*/
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
612 asm volatile(
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
613 "movl $-1024, %%esi \n\t"
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
614 ".balign 16 \n\t"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
615 "1: \n\t"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
616 "movaps (%0, %%esi), %%xmm0 \n\t"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
617 "movaps (%0, %%esi), %%xmm1 \n\t"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
618 "shufps $0xB1, %%xmm0, %%xmm0 \n\t"
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
619 "mulps 1024+"MANGLE(sseSinCos1c)"(%%esi), %%xmm1\n\t"
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
620 "mulps 1024+"MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t"
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
621 "addps %%xmm1, %%xmm0 \n\t"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
622 "movaps %%xmm0, (%0, %%esi) \n\t"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
623 "addl $16, %%esi \n\t"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
624 " jnz 1b \n\t"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
625 :: "r" (buf+128)
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
626 : "%esi"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
627 );
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
628
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
629
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
630 data_ptr = data;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
631 delay_ptr = delay;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
632 window_ptr = imdct_window;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
633
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
634 /* Window and convert to real valued signal */
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
635 asm volatile(
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
636 "xorl %%edi, %%edi \n\t" // 0
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
637 "xorl %%esi, %%esi \n\t" // 0
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
638 "movss %3, %%xmm2 \n\t" // bias
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
639 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
640 ".balign 16 \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
641 "1: \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
642 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ?
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
643 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ?
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
644 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ?
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
645 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ?
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
646 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
647 "mulps "MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
648 "addps (%2, %%esi), %%xmm0 \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
649 "addps %%xmm2, %%xmm0 \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
650 "movaps %%xmm0, (%1, %%esi) \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
651 "addl $16, %%esi \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
652 "subl $16, %%edi \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
653 "cmpl $512, %%esi \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
654 " jb 1b \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
655 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
656 : "%esi", "%edi"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
657 );
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
658 data_ptr+=128;
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
659 delay_ptr+=128;
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
660 // window_ptr+=128;
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
661
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
662 asm volatile(
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
663 "movl $1024, %%edi \n\t" // 1024
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
664 "xorl %%esi, %%esi \n\t" // 0
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
665 "movss %3, %%xmm2 \n\t" // bias
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
666 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
667 ".balign 16 \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
668 "1: \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
669 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
670 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
671 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
672 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
673 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
674 "mulps 512+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
675 "addps (%2, %%esi), %%xmm0 \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
676 "addps %%xmm2, %%xmm0 \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
677 "movaps %%xmm0, (%1, %%esi) \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
678 "addl $16, %%esi \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
679 "subl $16, %%edi \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
680 "cmpl $512, %%esi \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
681 " jb 1b \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
682 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
683 : "%esi", "%edi"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
684 );
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
685 data_ptr+=128;
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
686 // window_ptr+=128;
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
687
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
688 /* The trailing edge of the window goes into the delay line */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
689 delay_ptr = delay;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
690
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
691 asm volatile(
a501627fc6db sse opt
michael
parents: 3552
diff changeset
692 "xorl %%edi, %%edi \n\t" // 0
a501627fc6db sse opt
michael
parents: 3552
diff changeset
693 "xorl %%esi, %%esi \n\t" // 0
a501627fc6db sse opt
michael
parents: 3552
diff changeset
694 ".balign 16 \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
695 "1: \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
696 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A
a501627fc6db sse opt
michael
parents: 3552
diff changeset
697 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C
a501627fc6db sse opt
michael
parents: 3552
diff changeset
698 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C
a501627fc6db sse opt
michael
parents: 3552
diff changeset
699 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A
a501627fc6db sse opt
michael
parents: 3552
diff changeset
700 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
701 "mulps 1024+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
702 "movaps %%xmm0, (%1, %%esi) \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
703 "addl $16, %%esi \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
704 "subl $16, %%edi \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
705 "cmpl $512, %%esi \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
706 " jb 1b \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
707 :: "r" (buf+64), "r" (delay_ptr)
a501627fc6db sse opt
michael
parents: 3552
diff changeset
708 : "%esi", "%edi"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
709 );
a501627fc6db sse opt
michael
parents: 3552
diff changeset
710 delay_ptr+=128;
a501627fc6db sse opt
michael
parents: 3552
diff changeset
711 // window_ptr-=128;
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
712
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
713 asm volatile(
a501627fc6db sse opt
michael
parents: 3552
diff changeset
714 "movl $1024, %%edi \n\t" // 1024
a501627fc6db sse opt
michael
parents: 3552
diff changeset
715 "xorl %%esi, %%esi \n\t" // 0
a501627fc6db sse opt
michael
parents: 3552
diff changeset
716 ".balign 16 \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
717 "1: \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
718 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ?
a501627fc6db sse opt
michael
parents: 3552
diff changeset
719 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ?
a501627fc6db sse opt
michael
parents: 3552
diff changeset
720 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ?
a501627fc6db sse opt
michael
parents: 3552
diff changeset
721 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ?
a501627fc6db sse opt
michael
parents: 3552
diff changeset
722 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
723 "mulps 1536+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
724 "movaps %%xmm0, (%1, %%esi) \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
725 "addl $16, %%esi \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
726 "subl $16, %%edi \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
727 "cmpl $512, %%esi \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
728 " jb 1b \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
729 :: "r" (buf), "r" (delay_ptr)
a501627fc6db sse opt
michael
parents: 3552
diff changeset
730 : "%esi", "%edi"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
731 );
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
732 }
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
733 #endif //arch_x86
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
734
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
735 void
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
736 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias)
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
737 {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
738 int i,k;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
739 int p,q;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
740 int m;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
741 int two_m;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
742 int two_m_plus_one;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
743
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
744 sample_t tmp_a_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
745 sample_t tmp_a_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
746 sample_t tmp_b_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
747 sample_t tmp_b_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
748
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
749 sample_t *data_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
750 sample_t *delay_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
751 sample_t *window_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
752
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
753 complex_t *buf_1, *buf_2;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
754
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
755 buf_1 = &buf[0];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
756 buf_2 = &buf[64];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
757
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
758 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
759 for(k=0; k<64; k++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
760 /* X1[k] = X[2*k] */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
761 /* X2[k] = X[2*k+1] */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
762
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
763 p = 2 * (128-2*k-1);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
764 q = 2 * (2 * k);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
765
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
766 /* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
767 buf_1[k].real = data[p] * xcos2[k] - data[q] * xsin2[k];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
768 buf_1[k].imag = -1.0f * (data[q] * xcos2[k] + data[p] * xsin2[k]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
769 /* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
770 buf_2[k].real = data[p + 1] * xcos2[k] - data[q + 1] * xsin2[k];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
771 buf_2[k].imag = -1.0f * ( data[q + 1] * xcos2[k] + data[p + 1] * xsin2[k]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
772 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
773
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
774 /* IFFT Bit reversed shuffling */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
775 for(i=0; i<64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
776 k = bit_reverse_256[i];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
777 if (k < i) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
778 swap_cmplx(&buf_1[i],&buf_1[k]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
779 swap_cmplx(&buf_2[i],&buf_2[k]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
780 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
781 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
782
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
783 /* FFT Merge */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
784 for (m=0; m < 6; m++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
785 two_m = (1 << m);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
786 two_m_plus_one = (1 << (m+1));
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
787
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
788 /* FIXME: redundant - two_m was already set to (1 << m) above, and (1 << 0) == 1 */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
789 if(m)
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
790 two_m = (1 << m);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
791 else
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
792 two_m = 1;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
793
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
794 for(k = 0; k < two_m; k++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
795 for(i = 0; i < 64; i += two_m_plus_one) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
796 p = k + i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
797 q = p + two_m;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
798 /* Do block 1 */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
799 tmp_a_r = buf_1[p].real;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
800 tmp_a_i = buf_1[p].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
801 tmp_b_r = buf_1[q].real * w[m][k].real - buf_1[q].imag * w[m][k].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
802 tmp_b_i = buf_1[q].imag * w[m][k].real + buf_1[q].real * w[m][k].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
803 buf_1[p].real = tmp_a_r + tmp_b_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
804 buf_1[p].imag = tmp_a_i + tmp_b_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
805 buf_1[q].real = tmp_a_r - tmp_b_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
806 buf_1[q].imag = tmp_a_i - tmp_b_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
807
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
808 /* Do block 2 */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
809 tmp_a_r = buf_2[p].real;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
810 tmp_a_i = buf_2[p].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
811 tmp_b_r = buf_2[q].real * w[m][k].real - buf_2[q].imag * w[m][k].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
812 tmp_b_i = buf_2[q].imag * w[m][k].real + buf_2[q].real * w[m][k].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
813 buf_2[p].real = tmp_a_r + tmp_b_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
814 buf_2[p].imag = tmp_a_i + tmp_b_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
815 buf_2[q].real = tmp_a_r - tmp_b_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
816 buf_2[q].imag = tmp_a_i - tmp_b_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
817 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
818 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
819 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
820
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
821 /* Post IFFT complex multiply */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
822 for( i=0; i < 64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
823 /* y1[n] = z1[n] * (xcos2[n] + j * xsin2[n]) ; */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
824 tmp_a_r = buf_1[i].real;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
825 tmp_a_i = -buf_1[i].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
826 buf_1[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
827 buf_1[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
828 /* y2[n] = z2[n] * (xcos2[n] + j * xsin2[n]) ; */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
829 tmp_a_r = buf_2[i].real;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
830 tmp_a_i = -buf_2[i].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
831 buf_2[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
832 buf_2[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
833 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
834
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
835 data_ptr = data;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
836 delay_ptr = delay;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
837 window_ptr = imdct_window;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
838
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
839 /* Window and convert to real valued signal */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
840 for(i=0; i< 64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
841 *data_ptr++ = -buf_1[i].imag * *window_ptr++ + *delay_ptr++ + bias;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
842 *data_ptr++ = buf_1[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
843 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
844
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
845 for(i=0; i< 64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
846 *data_ptr++ = -buf_1[i].real * *window_ptr++ + *delay_ptr++ + bias;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
847 *data_ptr++ = buf_1[64-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
848 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
849
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
850 delay_ptr = delay;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
851
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
852 for(i=0; i< 64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
853 *delay_ptr++ = -buf_2[i].real * *--window_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
854 *delay_ptr++ = buf_2[64-i-1].imag * *--window_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
855 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
856
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
857 for(i=0; i< 64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
858 *delay_ptr++ = buf_2[i].imag * *--window_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
859 *delay_ptr++ = -buf_2[64-i-1].real * *--window_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
860 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
861 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
862
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
863 void imdct_init (uint32_t mm_accel)
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
864 {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
865 #ifdef LIBA52_MLIB
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
866 if (mm_accel & MM_ACCEL_MLIB) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
867 fprintf (stderr, "Using mlib for IMDCT transform\n");
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
868 imdct_512 = imdct_do_512_mlib;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
869 imdct_256 = imdct_do_256_mlib;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
870 } else
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
871 #endif
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
872 {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
873 int i, j, k;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
874
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
875 /* Twiddle factors to turn IFFT into IMDCT */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
876 for (i = 0; i < 128; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
877 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
878 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
879 }
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
880 #ifdef ARCH_X86
3527
5a88b21cfe8a sse opt
michael
parents: 3512
diff changeset
881 for (i = 0; i < 128; i++) {
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
882 sseSinCos1c[2*i+0]= xcos1[i];
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
883 sseSinCos1c[2*i+1]= -xcos1[i];
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
884 sseSinCos1d[2*i+0]= xsin1[i];
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
885 sseSinCos1d[2*i+1]= xsin1[i];
3527
5a88b21cfe8a sse opt
michael
parents: 3512
diff changeset
886 }
5a88b21cfe8a sse opt
michael
parents: 3512
diff changeset
887 #endif
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
888
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
889 /* More twiddle factors to turn IFFT into IMDCT */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
890 for (i = 0; i < 64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
891 xcos2[i] = -cos ((M_PI / 1024) * (8 * i + 1));
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
892 xsin2[i] = -sin ((M_PI / 1024) * (8 * i + 1));
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
893 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
894
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
895 for (i = 0; i < 7; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
896 j = 1 << i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
897 for (k = 0; k < j; k++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
898 w[i][k].real = cos (-M_PI * k / j);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
899 w[i][k].imag = sin (-M_PI * k / j);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
900 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
901 }
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
902 #ifdef ARCH_X86
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
903 for (i = 1; i < 7; i++) {
3483390a902b sse opt
michael
parents: 3529
diff changeset
904 j = 1 << i;
3483390a902b sse opt
michael
parents: 3529
diff changeset
905 for (k = 0; k < j; k+=2) {
3483390a902b sse opt
michael
parents: 3529
diff changeset
906
3483390a902b sse opt
michael
parents: 3529
diff changeset
907 sseW[i][4*k + 0] = w[i][k+0].real;
3483390a902b sse opt
michael
parents: 3529
diff changeset
908 sseW[i][4*k + 1] = w[i][k+0].real;
3483390a902b sse opt
michael
parents: 3529
diff changeset
909 sseW[i][4*k + 2] = w[i][k+1].real;
3483390a902b sse opt
michael
parents: 3529
diff changeset
910 sseW[i][4*k + 3] = w[i][k+1].real;
3483390a902b sse opt
michael
parents: 3529
diff changeset
911
3483390a902b sse opt
michael
parents: 3529
diff changeset
912 sseW[i][4*k + 4] = -w[i][k+0].imag;
3483390a902b sse opt
michael
parents: 3529
diff changeset
913 sseW[i][4*k + 5] = w[i][k+0].imag;
3483390a902b sse opt
michael
parents: 3529
diff changeset
914 sseW[i][4*k + 6] = -w[i][k+1].imag;
3483390a902b sse opt
michael
parents: 3529
diff changeset
915 sseW[i][4*k + 7] = w[i][k+1].imag;
3483390a902b sse opt
michael
parents: 3529
diff changeset
916
3483390a902b sse opt
michael
parents: 3529
diff changeset
917 //we multiply more or less uninitialized numbers so we need to use exactly 0.0
3483390a902b sse opt
michael
parents: 3529
diff changeset
918 if(k==0)
3483390a902b sse opt
michael
parents: 3529
diff changeset
919 {
3483390a902b sse opt
michael
parents: 3529
diff changeset
920 // sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
3483390a902b sse opt
michael
parents: 3529
diff changeset
921 sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0;
3483390a902b sse opt
michael
parents: 3529
diff changeset
922 }
3483390a902b sse opt
michael
parents: 3529
diff changeset
923
3483390a902b sse opt
michael
parents: 3529
diff changeset
924 if(2*k == j)
3483390a902b sse opt
michael
parents: 3529
diff changeset
925 {
3483390a902b sse opt
michael
parents: 3529
diff changeset
926 sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0;
3483390a902b sse opt
michael
parents: 3529
diff changeset
927 // sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0);
3483390a902b sse opt
michael
parents: 3529
diff changeset
928 }
3483390a902b sse opt
michael
parents: 3529
diff changeset
929 }
3483390a902b sse opt
michael
parents: 3529
diff changeset
930 }
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
931
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
932 for(i=0; i<128; i++)
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
933 {
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
934 sseWindow[2*i+0]= -imdct_window[2*i+0];
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
935 sseWindow[2*i+1]= imdct_window[2*i+1];
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
936 }
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
937
a501627fc6db sse opt
michael
parents: 3552
diff changeset
938 for(i=0; i<64; i++)
a501627fc6db sse opt
michael
parents: 3552
diff changeset
939 {
a501627fc6db sse opt
michael
parents: 3552
diff changeset
940 sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1];
a501627fc6db sse opt
michael
parents: 3552
diff changeset
941 sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0];
a501627fc6db sse opt
michael
parents: 3552
diff changeset
942 sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1];
a501627fc6db sse opt
michael
parents: 3552
diff changeset
943 sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0];
a501627fc6db sse opt
michael
parents: 3552
diff changeset
944 }
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
945 #endif // arch_x86
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
946
3720
120ac80f13c2 Fixed #ifdef discrepancy that was breaking compilation on PPC platform
melanson
parents: 3623
diff changeset
947 imdct_512 = imdct_do_512;
120ac80f13c2 Fixed #ifdef discrepancy that was breaking compilation on PPC platform
melanson
parents: 3623
diff changeset
948 #ifdef ARCH_X86
4497
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
949 if(mm_accel & MM_ACCEL_X86_SSE)
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
950 {
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
951 fprintf (stderr, "Using SSE optimized IMDCT transform\n");
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
952 imdct_512 = imdct_do_512_sse;
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
953 }
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
954 else
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
955 if(mm_accel & MM_ACCEL_X86_3DNOWEXT)
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
956 {
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
957 fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n");
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
958 imdct_512 = imdct_do_512_3dnowex;
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
959 }
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
960 else
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
961 if(mm_accel & MM_ACCEL_X86_3DNOW)
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
962 {
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
963 fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
964 imdct_512 = imdct_do_512_3dnow;
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
965 }
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
966 else
3720
120ac80f13c2 Fixed #ifdef discrepancy that was breaking compilation on PPC platform
melanson
parents: 3623
diff changeset
967 #endif // arch_x86
4497
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
968 fprintf (stderr, "No accelerated IMDCT transform found\n");
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
969 imdct_256 = imdct_do_256;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
970 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
971 }
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
972
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
/*
 * fft_asmb - apply the TRANSZERO/TRANS macros across four equally spaced
 * runs of the buffer x.
 *
 *   k    - quarter-size parameter. The four runs start at x, x + 2*k,
 *          x + 4*k and x + 6*k, and indices 0 .. 2*k-1 of each run are
 *          visited. Must be >= 2: for k == 1 the pre-decrement below leaves
 *          k at 0 and the "!--k" exit test would then see -1, so the loop
 *          would not stop where intended.
 *   x    - base of the data, updated in place through the macros.
 *   wTB  - second data pointer handed to TRANS together with wB = wTB + 2*k.
 *          In the calls visible in fft_128p, wTB equals x + 4*k.
 *   d, d_3 - read-only tables indexed in step with x (fft_128p passes the
 *          delta32 / delta64 tables and their _3 companions here).
 *
 * NOTE(review): the float temporaries declared below are never referenced
 * directly in this function; presumably the TRANSZERO/TRANS macros (defined
 * outside this chunk) use them by name, and TRANSZERO may also read k and
 * wTB implicitly - confirm before renaming or reordering anything.
 */
973 static void fft_asmb(int k, complex_t *x, complex_t *wTB,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
974 const complex_t *d, const complex_t *d_3)
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
975 {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
976 register complex_t *x2k, *x3k, *x4k, *wB;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
977 register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
978
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
/* Start of the 2nd, 3rd and 4th runs, each 2*k elements after the previous. */
979 x2k = x + 2 * k;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
980 x3k = x2k + 2 * k;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
981 x4k = x3k + 2 * k;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
982 wB = wTB + 2 * k;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
983
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
/* Index 0 uses the dedicated TRANSZERO macro (no d / d_3 arguments);
 * index 1 is handled here so the loop below can proceed in pairs. */
984 TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
985 TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
986
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
/* One pair is already done, so k-1 pairs (indices 2 .. 2*k-1) remain. */
987 --k;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
988 for(;;) {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
989 TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
990 TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
/* Exit before the pointer bumps once the last pair has been processed. */
991 if (!--k) break;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
/* Slide every pointer forward by one pair so [2] and [3] address the next two elements. */
992 x += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
993 x2k += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
994 x3k += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
995 x4k += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
996 d += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
997 d_3 += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
998 wTB += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
999 wB += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1000 }
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1001
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1002 }
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1003
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
/*
 * fft_asmb16 - fixed-size counterpart of fft_asmb for a 16-element buffer:
 * the four runs are x[0..3], x[4..7], x[8..11] and x[12..15], and each of
 * the four index positions gets exactly one macro invocation.
 *
 *   x    - base of the 16 elements, updated in place through the macros.
 *   wTB  - second data pointer; the callers visible in fft_128p pass x + 8.
 *
 * The TRANS calls take their table entries from the file-level delta16 and
 * delta16_3 arrays; TRANSZERO and TRANSHALF_16 take no table arguments.
 *
 * NOTE(review): k and the float temporaries are never referenced directly
 * here; presumably the TRANS* macros (defined outside this chunk) pick them
 * up by name - confirm before removing or renaming any of them.
 * NOTE(review): the per-call comments below list elements in the order
 * 0, 8, 4, 12 while the macro arguments are passed in the order 0, 4, 8, 12;
 * the same four elements are involved either way.
 */
1004 static void fft_asmb16(complex_t *x, complex_t *wTB)
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1005 {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1006 register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1007 int k = 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1008
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1009 /* transform x[0], x[8], x[4], x[12] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1010 TRANSZERO(x[0],x[4],x[8],x[12]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1011
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1012 /* transform x[1], x[9], x[5], x[13] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1013 TRANS(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1014
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1015 /* transform x[2], x[10], x[6], x[14] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1016 TRANSHALF_16(x[2],x[6],x[10],x[14]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1017
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1018 /* transform x[3], x[11], x[7], x[15] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1019 TRANS(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1020
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1021 }
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1022
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1023 static void fft_4(complex_t *x)
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1024 {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1025 /* delta_p = 1 here */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1026 /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4}
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1027 */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1028
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1029 register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1030
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1031 yt_r = x[0].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1032 yb_r = yt_r - x[2].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1033 yt_r += x[2].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1034
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1035 u_r = x[1].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1036 vi_i = x[3].real - u_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1037 u_r += x[3].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1038
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1039 u_i = x[1].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1040 vi_r = u_i - x[3].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1041 u_i += x[3].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1042
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1043 yt_i = yt_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1044 yt_i += u_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1045 x[0].real = yt_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1046 yt_r -= u_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1047 x[2].real = yt_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1048 yt_i = yb_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1049 yt_i += vi_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1050 x[1].real = yt_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1051 yb_r -= vi_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1052 x[3].real = yb_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1053
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1054 yt_i = x[0].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1055 yb_i = yt_i - x[2].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1056 yt_i += x[2].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1057
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1058 yt_r = yt_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1059 yt_r += u_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1060 x[0].imag = yt_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1061 yt_i -= u_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1062 x[2].imag = yt_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1063 yt_r = yb_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1064 yt_r += vi_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1065 x[1].imag = yt_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1066 yb_i -= vi_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1067 x[3].imag = yb_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1068 }
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1069
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1070
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1071 static void fft_8(complex_t *x)
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1072 {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1073 /* delta_p = diag{1, sqrt(i)} here */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1074 /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8}
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1075 */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1076 register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1077
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1078 wT1_r = x[1].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1079 wT1_i = x[1].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1080 wB1_r = x[3].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1081 wB1_i = x[3].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1082
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1083 x[1] = x[2];
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1084 x[2] = x[4];
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1085 x[3] = x[6];
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1086 fft_4(&x[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1087
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1088
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1089 /* x[0] x[4] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1090 wT2_r = x[5].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1091 wT2_r += x[7].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1092 wT2_r += wT1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1093 wT2_r += wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1094 wT2_i = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1095 wT2_r += x[0].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1096 wT2_i = x[0].real - wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1097 x[0].real = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1098 x[4].real = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1099
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1100 wT2_i = x[5].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1101 wT2_i += x[7].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1102 wT2_i += wT1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1103 wT2_i += wB1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1104 wT2_r = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1105 wT2_r += x[0].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1106 wT2_i = x[0].imag - wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1107 x[0].imag = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1108 x[4].imag = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1109
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1110 /* x[2] x[6] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1111 wT2_r = x[5].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1112 wT2_r -= x[7].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1113 wT2_r += wT1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1114 wT2_r -= wB1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1115 wT2_i = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1116 wT2_r += x[2].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1117 wT2_i = x[2].real - wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1118 x[2].real = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1119 x[6].real = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1120
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1121 wT2_i = x[5].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1122 wT2_i -= x[7].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1123 wT2_i += wT1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1124 wT2_i -= wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1125 wT2_r = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1126 wT2_r += x[2].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1127 wT2_i = x[2].imag - wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1128 x[2].imag = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1129 x[6].imag = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1130
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1131
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1132 /* x[1] x[5] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1133 wT2_r = wT1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1134 wT2_r += wB1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1135 wT2_r -= x[5].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1136 wT2_r -= x[7].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1137 wT2_i = wT1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1138 wT2_i -= wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1139 wT2_i -= x[5].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1140 wT2_i += x[7].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1141
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1142 wB2_r = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1143 wB2_r += wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1144 wT2_i -= wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1145 wB2_r *= HSQRT2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1146 wT2_i *= HSQRT2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1147 wT2_r = wB2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1148 wB2_r += x[1].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1149 wT2_r = x[1].real - wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1150
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1151 wB2_i = x[5].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1152 x[1].real = wB2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1153 x[5].real = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1154
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1155 wT2_r = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1156 wT2_r += x[1].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1157 wT2_i = x[1].imag - wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1158 wB2_r = x[5].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1159 x[1].imag = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1160 x[5].imag = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1161
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1162 /* x[3] x[7] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1163 wT1_r -= wB1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1164 wT1_i += wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1165 wB1_r = wB2_i - x[7].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1166 wB1_i = wB2_r + x[7].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1167 wT1_r -= wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1168 wT1_i -= wB1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1169 wB1_r = wT1_r + wT1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1170 wB1_r *= HSQRT2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1171 wT1_i -= wT1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1172 wT1_i *= HSQRT2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1173 wB2_r = x[3].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1174 wB2_i = wB2_r + wT1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1175 wB2_r -= wT1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1176 x[3].real = wB2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1177 x[7].real = wB2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1178 wB2_i = x[3].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1179 wB2_r = wB2_i + wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1180 wB2_i -= wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1181 x[3].imag = wB2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1182 x[7].imag = wB2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1183 }
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1184
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1185
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1186 static void fft_128p(complex_t *a)
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1187 {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1188 fft_8(&a[0]); fft_4(&a[8]); fft_4(&a[12]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1189 fft_asmb16(&a[0], &a[8]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1190
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1191 fft_8(&a[16]), fft_8(&a[24]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1192 fft_asmb(4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1193
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1194 fft_8(&a[32]); fft_4(&a[40]); fft_4(&a[44]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1195 fft_asmb16(&a[32], &a[40]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1196
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1197 fft_8(&a[48]); fft_4(&a[56]); fft_4(&a[60]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1198 fft_asmb16(&a[48], &a[56]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1199
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1200 fft_asmb(8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1201
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1202 fft_8(&a[64]); fft_4(&a[72]); fft_4(&a[76]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1203 /* fft_16(&a[64]); */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1204 fft_asmb16(&a[64], &a[72]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1205
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1206 fft_8(&a[80]); fft_8(&a[88]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1207
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1208 /* fft_32(&a[64]); */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1209 fft_asmb(4, &a[64], &a[80],&delta32[0], &delta32_3[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1210
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1211 fft_8(&a[96]); fft_4(&a[104]), fft_4(&a[108]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1212 /* fft_16(&a[96]); */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1213 fft_asmb16(&a[96], &a[104]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1214
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1215 fft_8(&a[112]), fft_8(&a[120]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1216 /* fft_32(&a[96]); */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1217 fft_asmb(4, &a[96], &a[112], &delta32[0], &delta32_3[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1218
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1219 /* fft_128(&a[0]); */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1220 fft_asmb(16, &a[0], &a[64], &delta128[0], &delta128_3[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1221 }
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1222
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1223
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1224