annotate liba52/imdct.c @ 14802:63b1cc62fc9b

MEncoder multiple files patch by Oded Shimon (ods15) Seems to work, or at least not to cause problems with existing functionality (encoding single files). Please test and report bugs, if there are any!
author rfelker
date Fri, 25 Feb 2005 02:32:29 +0000
parents f881c918739b
children 07f1e7669772
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1 /*
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
2 * imdct.c
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
3 * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
5 *
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
6 * This file is part of a52dec, a free ATSC A-52 stream decoder.
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
7 * See http://liba52.sourceforge.net/ for updates.
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
8 *
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
9 * a52dec is free software; you can redistribute it and/or modify
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
10 * it under the terms of the GNU General Public License as published by
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
11 * the Free Software Foundation; either version 2 of the License, or
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
12 * (at your option) any later version.
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
13 *
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
14 * a52dec is distributed in the hope that it will be useful,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
17 * GNU General Public License for more details.
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
18 *
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
19 * You should have received a copy of the GNU General Public License
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
20 * along with this program; if not, write to the Free Software
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
22 *
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
23 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
24 * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
25 * Michael ported them from libac3 (untested, perhaps totally broken)
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
26 * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
27 */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
28
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
29 #include "config.h"
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
30
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
31 #include <math.h>
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
32 #include <stdio.h>
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
33 #ifndef M_PI
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
34 #define M_PI 3.1415926535897932384626433832795029
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
35 #endif
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
36 #include <inttypes.h>
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
37
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
38 #include "a52.h"
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
39 #include "a52_internal.h"
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
40 #include "mm_accel.h"
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
41 #include "mangle.h"
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
42
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
43 #ifdef RUNTIME_CPUDETECT
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
44 #undef HAVE_3DNOWEX
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
45 #endif
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
46
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
47 #define USE_AC3_C
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
48
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
49 void (* imdct_256) (sample_t data[], sample_t delay[], sample_t bias);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
50 void (* imdct_512) (sample_t data[], sample_t delay[], sample_t bias);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
51
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
52 typedef struct complex_s {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
53 sample_t real;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
54 sample_t imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
55 } complex_t;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
56
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
57 static void fft_128p(complex_t *a);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
58
12303
f881c918739b attribute_used patch by (VMiklos <mamajom at axelero dot hu>)
michael
parents: 9122
diff changeset
59 static const int pm128[128] attribute_used __attribute__((aligned(16))) =
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
60 {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
61 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
62 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
63 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
64 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
65 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
66 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
67 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
68 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
69 };
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
70
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
71 /* 128 point bit-reverse LUT */
12303
f881c918739b attribute_used patch by (VMiklos <mamajom at axelero dot hu>)
michael
parents: 9122
diff changeset
72 static uint8_t attribute_used bit_reverse_512[] = {
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
73 0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
74 0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
75 0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
76 0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
77 0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
78 0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
79 0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
80 0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
81 0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
82 0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
83 0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
84 0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
85 0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
86 0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
87 0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
88 0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
89
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
/*
 * 64-entry bit-reverse LUT: entry i is i with its 6 bits reversed.
 *
 * Its consumer is not visible in this part of the file; by its name it
 * serves the 256-point transform -- confirm at the use site.
 *
 * The table is only ever read, so it is const, like pm128.
 */
static const uint8_t bit_reverse_256[] = {
    0x00, 0x20, 0x10, 0x30, 0x08, 0x28, 0x18, 0x38,
    0x04, 0x24, 0x14, 0x34, 0x0c, 0x2c, 0x1c, 0x3c,
    0x02, 0x22, 0x12, 0x32, 0x0a, 0x2a, 0x1a, 0x3a,
    0x06, 0x26, 0x16, 0x36, 0x0e, 0x2e, 0x1e, 0x3e,
    0x01, 0x21, 0x11, 0x31, 0x09, 0x29, 0x19, 0x39,
    0x05, 0x25, 0x15, 0x35, 0x0d, 0x2d, 0x1d, 0x3d,
    0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b,
    0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f
};
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
99
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
100 #ifdef ARCH_X86
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
101 // NOTE: SSE needs 16byte alignment or it will segfault
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
102 //
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
103 static complex_t __attribute__((aligned(16))) buf[128];
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
104 static float __attribute__((aligned(16))) sseSinCos1c[256];
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
105 static float __attribute__((aligned(16))) sseSinCos1d[256];
12303
f881c918739b attribute_used patch by (VMiklos <mamajom at axelero dot hu>)
michael
parents: 9122
diff changeset
106 static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
107 //static float __attribute__((aligned(16))) sseW0[4];
3483390a902b sse opt
michael
parents: 3529
diff changeset
108 static float __attribute__((aligned(16))) sseW1[8];
3483390a902b sse opt
michael
parents: 3529
diff changeset
109 static float __attribute__((aligned(16))) sseW2[16];
3483390a902b sse opt
michael
parents: 3529
diff changeset
110 static float __attribute__((aligned(16))) sseW3[32];
3483390a902b sse opt
michael
parents: 3529
diff changeset
111 static float __attribute__((aligned(16))) sseW4[64];
3483390a902b sse opt
michael
parents: 3529
diff changeset
112 static float __attribute__((aligned(16))) sseW5[128];
3483390a902b sse opt
michael
parents: 3529
diff changeset
113 static float __attribute__((aligned(16))) sseW6[256];
3483390a902b sse opt
michael
parents: 3529
diff changeset
114 static float __attribute__((aligned(16))) *sseW[7]=
3483390a902b sse opt
michael
parents: 3529
diff changeset
115 {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
116 static float __attribute__((aligned(16))) sseWindow[512];
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
117 #else
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
118 static complex_t __attribute__((aligned(16))) buf[128];
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
119 #endif
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
120
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
121 /* Twiddle factor LUT */
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
122 static complex_t __attribute__((aligned(16))) w_1[1];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
123 static complex_t __attribute__((aligned(16))) w_2[2];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
124 static complex_t __attribute__((aligned(16))) w_4[4];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
125 static complex_t __attribute__((aligned(16))) w_8[8];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
126 static complex_t __attribute__((aligned(16))) w_16[16];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
127 static complex_t __attribute__((aligned(16))) w_32[32];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
128 static complex_t __attribute__((aligned(16))) w_64[64];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
129 static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
130
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
131 /* Twiddle factors for IMDCT */
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
132 static sample_t __attribute__((aligned(16))) xcos1[128];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
133 static sample_t __attribute__((aligned(16))) xsin1[128];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
134 static sample_t __attribute__((aligned(16))) xcos2[64];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
135 static sample_t __attribute__((aligned(16))) xsin2[64];
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
136
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
137 /* Windowing function for Modified DCT - Thank you acroread */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
138 sample_t imdct_window[] = {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
139 0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
140 0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
141 0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
142 0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
143 0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
144 0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
145 0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
146 0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
147 0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
148 0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
149 0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
150 0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
151 0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
152 0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
153 0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
154 0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
155 0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
156 0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
157 0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
158 0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
159 0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
160 0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
161 0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
162 0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
163 0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
164 0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
165 0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
166 0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
167 0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
168 0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
169 0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
170 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000 };
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
171
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
172
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
173 static inline void swap_cmplx(complex_t *a, complex_t *b)
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
174 {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
175 complex_t tmp;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
176
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
177 tmp = *a;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
178 *a = *b;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
179 *b = tmp;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
180 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
181
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
182
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
183
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
184 static inline complex_t cmplx_mult(complex_t a, complex_t b)
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
185 {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
186 complex_t ret;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
187
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
188 ret.real = a.real * b.real - a.imag * b.imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
189 ret.imag = a.real * b.imag + a.imag * b.real;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
190
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
191 return ret;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
192 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
193
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
194 void
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
195 imdct_do_512(sample_t data[],sample_t delay[], sample_t bias)
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
196 {
8254
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
197 int i;
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
198 #ifndef USE_AC3_C
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
199 int k;
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
200 int p,q;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
201 int m;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
202 int two_m;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
203 int two_m_plus_one;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
204
8254
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
205 sample_t tmp_b_i;
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
206 sample_t tmp_b_r;
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
207 #endif
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
208 sample_t tmp_a_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
209 sample_t tmp_a_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
210
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
211 sample_t *data_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
212 sample_t *delay_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
213 sample_t *window_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
214
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
215 /* 512 IMDCT with source and dest data in 'data' */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
216
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
217 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
218 for( i=0; i < 128; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
219 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
220 #ifdef USE_AC3_C
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
221 int j= pm128[i];
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
222 #else
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
223 int j= bit_reverse_512[i];
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
224 #endif
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
225 buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
226 buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
227 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
228
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
229 /* FFT Merge */
3549
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
230 /* unoptimized variant
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
231 for (m=1; m < 7; m++) {
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
232 if(m)
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
233 two_m = (1 << m);
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
234 else
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
235 two_m = 1;
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
236
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
237 two_m_plus_one = (1 << (m+1));
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
238
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
239 for(i = 0; i < 128; i += two_m_plus_one) {
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
240 for(k = 0; k < two_m; k++) {
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
241 p = k + i;
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
242 q = p + two_m;
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
243 tmp_a_r = buf[p].real;
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
244 tmp_a_i = buf[p].imag;
3549
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
245 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
246 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
247 buf[p].real = tmp_a_r + tmp_b_r;
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
248 buf[p].imag = tmp_a_i + tmp_b_i;
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
249 buf[q].real = tmp_a_r - tmp_b_r;
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
250 buf[q].imag = tmp_a_i - tmp_b_i;
3549
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
251 }
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
252 }
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
253 }
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
254 */
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
255 #ifdef USE_AC3_C
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
256 fft_128p (&buf[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
257 #else
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
258
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
259 /* 1. iteration */
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
260 for(i = 0; i < 128; i += 2) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
261 tmp_a_r = buf[i].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
262 tmp_a_i = buf[i].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
263 tmp_b_r = buf[i+1].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
264 tmp_b_i = buf[i+1].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
265 buf[i].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
266 buf[i].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
267 buf[i+1].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
268 buf[i+1].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
269 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
270
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
271 /* 2. iteration */
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
272 // Note w[1]={{1,0}, {0,-1}}
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
273 for(i = 0; i < 128; i += 4) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
274 tmp_a_r = buf[i].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
275 tmp_a_i = buf[i].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
276 tmp_b_r = buf[i+2].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
277 tmp_b_i = buf[i+2].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
278 buf[i].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
279 buf[i].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
280 buf[i+2].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
281 buf[i+2].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
282 tmp_a_r = buf[i+1].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
283 tmp_a_i = buf[i+1].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
284 tmp_b_r = buf[i+3].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
285 tmp_b_i = buf[i+3].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
286 buf[i+1].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
287 buf[i+1].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
288 buf[i+3].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
289 buf[i+3].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
290 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
291
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
292 /* 3. iteration */
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
293 for(i = 0; i < 128; i += 8) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
294 tmp_a_r = buf[i].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
295 tmp_a_i = buf[i].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
296 tmp_b_r = buf[i+4].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
297 tmp_b_i = buf[i+4].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
298 buf[i].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
299 buf[i].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
300 buf[i+4].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
301 buf[i+4].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
302 tmp_a_r = buf[1+i].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
303 tmp_a_i = buf[1+i].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
304 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
305 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
306 buf[1+i].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
307 buf[1+i].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
308 buf[i+5].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
309 buf[i+5].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
310 tmp_a_r = buf[i+2].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
311 tmp_a_i = buf[i+2].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
312 tmp_b_r = buf[i+6].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
313 tmp_b_i = - buf[i+6].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
314 buf[i+2].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
315 buf[i+2].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
316 buf[i+6].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
317 buf[i+6].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
318 tmp_a_r = buf[i+3].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
319 tmp_a_i = buf[i+3].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
320 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
321 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
322 buf[i+3].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
323 buf[i+3].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
324 buf[i+7].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
325 buf[i+7].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
326 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
327
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
328 /* 4-7. iterations */
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
329 for (m=3; m < 7; m++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
330 two_m = (1 << m);
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
331
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
332 two_m_plus_one = two_m<<1;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
333
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
334 for(i = 0; i < 128; i += two_m_plus_one) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
335 for(k = 0; k < two_m; k++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
336 int p = k + i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
337 int q = p + two_m;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
338 tmp_a_r = buf[p].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
339 tmp_a_i = buf[p].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
340 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
341 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
342 buf[p].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
343 buf[p].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
344 buf[q].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
345 buf[q].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
346 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
347 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
348 }
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
349 #endif
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
350 /* Post IFFT complex multiply plus IFFT complex conjugate*/
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
351 for( i=0; i < 128; i++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
352 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
353 tmp_a_r = buf[i].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
354 tmp_a_i = -1.0 * buf[i].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
355 buf[i].real =(tmp_a_r * xcos1[i]) - (tmp_a_i * xsin1[i]);
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
356 buf[i].imag =(tmp_a_r * xsin1[i]) + (tmp_a_i * xcos1[i]);
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
357 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
358
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
359 data_ptr = data;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
360 delay_ptr = delay;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
361 window_ptr = imdct_window;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
362
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
363 /* Window and convert to real valued signal */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
364 for(i=0; i< 64; i++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
365 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
366 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
367 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
368
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
369 for(i=0; i< 64; i++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
370 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
371 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
372 }
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
373
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
374 /* The trailing edge of the window goes into the delay line */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
375 delay_ptr = delay;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
376
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
377 for(i=0; i< 64; i++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
378 *delay_ptr++ = -buf[64+i].real * *--window_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
379 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
380 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
381
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
382 for(i=0; i<64; i++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
383 *delay_ptr++ = buf[i].imag * *--window_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
384 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
385 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
386 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
387
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
388 #ifdef HAVE_ALTIVEC
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
389
9122
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
390 #ifndef SYS_DARWIN
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
391 #include <altivec.h>
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
392 #endif
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
393
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
394 // used to build register permutation vectors (vcprm)
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
395 // the 's' are for words in the _s_econd vector
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
396 #define WORD_0 0x00,0x01,0x02,0x03
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
397 #define WORD_1 0x04,0x05,0x06,0x07
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
398 #define WORD_2 0x08,0x09,0x0a,0x0b
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
399 #define WORD_3 0x0c,0x0d,0x0e,0x0f
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
400 #define WORD_s0 0x10,0x11,0x12,0x13
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
401 #define WORD_s1 0x14,0x15,0x16,0x17
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
402 #define WORD_s2 0x18,0x19,0x1a,0x1b
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
403 #define WORD_s3 0x1c,0x1d,0x1e,0x1f
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
404
9122
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
405 #ifdef SYS_DARWIN
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
406 #define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
9122
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
407 #else
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
408 #define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
409 #endif
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
410
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
411 // vcprmle is used to keep the same index as in the SSE version.
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
412 // it's the same as vcprm, with the argument order reversed
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
413 // ('le' is Little Endian)
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
414 #define vcprmle(a,b,c,d) vcprm(d,c,b,a)
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
415
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
416 // used to build sign-inversion/identity multiplier vectors of -1.0/+1.0 (vcii)
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
417 // n is _n_egative, p is _p_ositive
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
418 #define FLOAT_n -1.
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
419 #define FLOAT_p 1.
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
420
9122
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
421 #ifdef SYS_DARWIN
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
422 #define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
9122
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
423 #else
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
424 #define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
425 #endif
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
426
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
427 #ifdef SYS_DARWIN
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
428 #define FOUROF(a) (a)
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
429 #else
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
430 #define FOUROF(a) {a,a,a,a}
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
431 #endif
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
432
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
433
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
434 void
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
435 imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
436 {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
437 int i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
438 int k;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
439 int p,q;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
440 int m;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
441 int two_m;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
442 int two_m_plus_one;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
443
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
444 sample_t tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
445 sample_t tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
446 sample_t tmp_a_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
447 sample_t tmp_a_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
448
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
449 sample_t *data_ptr;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
450 sample_t *delay_ptr;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
451 sample_t *window_ptr;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
452
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
453 /* 512 IMDCT with source and dest data in 'data' */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
454
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
455 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
456 for( i=0; i < 128; i++) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
457 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
458 int j= bit_reverse_512[i];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
459 buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
460 buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
461 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
462
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
463 /* 1. iteration */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
464 for(i = 0; i < 128; i += 2) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
465 #if 0
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
466 tmp_a_r = buf[i].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
467 tmp_a_i = buf[i].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
468 tmp_b_r = buf[i+1].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
469 tmp_b_i = buf[i+1].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
470 buf[i].real = tmp_a_r + tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
471 buf[i].imag = tmp_a_i + tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
472 buf[i+1].real = tmp_a_r - tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
473 buf[i+1].imag = tmp_a_i - tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
474 #else
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
475 vector float temp, bufv;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
476
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
477 bufv = vec_ld(i << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
478 temp = vec_perm(bufv, bufv, vcprm(2,3,0,1));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
479 bufv = vec_madd(bufv, vcii(p,p,n,n), temp);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
480 vec_st(bufv, i << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
481 #endif
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
482 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
483
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
484 /* 2. iteration */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
485 // Note w[1]={{1,0}, {0,-1}}
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
486 for(i = 0; i < 128; i += 4) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
487 #if 0
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
488 tmp_a_r = buf[i].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
489 tmp_a_i = buf[i].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
490 tmp_b_r = buf[i+2].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
491 tmp_b_i = buf[i+2].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
492 buf[i].real = tmp_a_r + tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
493 buf[i].imag = tmp_a_i + tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
494 buf[i+2].real = tmp_a_r - tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
495 buf[i+2].imag = tmp_a_i - tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
496 tmp_a_r = buf[i+1].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
497 tmp_a_i = buf[i+1].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
498 /* WARNING: im <-> re here ! */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
499 tmp_b_r = buf[i+3].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
500 tmp_b_i = buf[i+3].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
501 buf[i+1].real = tmp_a_r + tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
502 buf[i+1].imag = tmp_a_i - tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
503 buf[i+3].real = tmp_a_r - tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
504 buf[i+3].imag = tmp_a_i + tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
505 #else
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
506 vector float buf01, buf23, temp1, temp2;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
507
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
508 buf01 = vec_ld((i + 0) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
509 buf23 = vec_ld((i + 2) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
510 buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
511
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
512 temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
513 temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
514
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
515 vec_st(temp1, (i + 0) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
516 vec_st(temp2, (i + 2) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
517 #endif
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
518 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
519
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
520 /* 3. iteration */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
521 for(i = 0; i < 128; i += 8) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
522 #if 0
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
523 tmp_a_r = buf[i].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
524 tmp_a_i = buf[i].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
525 tmp_b_r = buf[i+4].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
526 tmp_b_i = buf[i+4].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
527 buf[i].real = tmp_a_r + tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
528 buf[i].imag = tmp_a_i + tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
529 buf[i+4].real = tmp_a_r - tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
530 buf[i+4].imag = tmp_a_i - tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
531 tmp_a_r = buf[1+i].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
532 tmp_a_i = buf[1+i].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
533 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
534 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
535 buf[1+i].real = tmp_a_r + tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
536 buf[1+i].imag = tmp_a_i + tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
537 buf[i+5].real = tmp_a_r - tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
538 buf[i+5].imag = tmp_a_i - tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
539 tmp_a_r = buf[i+2].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
540 tmp_a_i = buf[i+2].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
541 /* WARNING re <-> im & sign */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
542 tmp_b_r = buf[i+6].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
543 tmp_b_i = - buf[i+6].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
544 buf[i+2].real = tmp_a_r + tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
545 buf[i+2].imag = tmp_a_i + tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
546 buf[i+6].real = tmp_a_r - tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
547 buf[i+6].imag = tmp_a_i - tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
548 tmp_a_r = buf[i+3].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
549 tmp_a_i = buf[i+3].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
550 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
551 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
552 buf[i+3].real = tmp_a_r + tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
553 buf[i+3].imag = tmp_a_i + tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
554 buf[i+7].real = tmp_a_r - tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
555 buf[i+7].imag = tmp_a_i - tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
556 #else
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
557 vector float buf01, buf23, buf45, buf67;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
558
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
559 buf01 = vec_ld((i + 0) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
560 buf23 = vec_ld((i + 2) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
561
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
562 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
563 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
564 buf[i+5].real = tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
565 buf[i+5].imag = tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
566 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
567 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
568 buf[i+7].real = tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
569 buf[i+7].imag = tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
570
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
571 buf23 = vec_ld((i + 2) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
572 buf45 = vec_ld((i + 4) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
573 buf67 = vec_ld((i + 6) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
574 buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
575
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
576 vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
577 vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
578 vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
579 vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
580 #endif
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
581 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
582
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
583 /* 4-7. iterations */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
584 for (m=3; m < 7; m++) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
585 two_m = (1 << m);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
586
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
587 two_m_plus_one = two_m<<1;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
588
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
589 for(i = 0; i < 128; i += two_m_plus_one) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
590 for(k = 0; k < two_m; k+=2) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
591 #if 0
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
592 int p = k + i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
593 int q = p + two_m;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
594 tmp_a_r = buf[p].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
595 tmp_a_i = buf[p].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
596 tmp_b_r =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
597 buf[q].real * w[m][k].real -
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
598 buf[q].imag * w[m][k].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
599 tmp_b_i =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
600 buf[q].imag * w[m][k].real +
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
601 buf[q].real * w[m][k].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
602 buf[p].real = tmp_a_r + tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
603 buf[p].imag = tmp_a_i + tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
604 buf[q].real = tmp_a_r - tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
605 buf[q].imag = tmp_a_i - tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
606
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
607 tmp_a_r = buf[(p + 1)].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
608 tmp_a_i = buf[(p + 1)].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
609 tmp_b_r =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
610 buf[(q + 1)].real * w[m][(k + 1)].real -
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
611 buf[(q + 1)].imag * w[m][(k + 1)].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
612 tmp_b_i =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
613 buf[(q + 1)].imag * w[m][(k + 1)].real +
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
614 buf[(q + 1)].real * w[m][(k + 1)].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
615 buf[(p + 1)].real = tmp_a_r + tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
616 buf[(p + 1)].imag = tmp_a_i + tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
617 buf[(q + 1)].real = tmp_a_r - tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
618 buf[(q + 1)].imag = tmp_a_i - tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
619 #else
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
620 int p = k + i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
621 int q = p + two_m;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
622 vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4;
9122
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
623 const vector float vczero = (const vector float)FOUROF(0.);
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
624 // first compute buf[q] and buf[q+1]
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
625 vecq = vec_ld(q << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
626 vecw = vec_ld(0, (float*)&(w[m][k]));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
627 temp1 = vec_madd(vecq, vecw, vczero);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
628 temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
629 temp2 = vec_madd(temp2, vecw, vczero);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
630 temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
631 temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
632 vecq = vec_madd(temp4, vcii(n,p,n,p), temp3);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
633 // then butterfly with buf[p] and buf[p+1]
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
634 vecp = vec_ld(p << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
635
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
636 temp1 = vec_add(vecp, vecq);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
637 temp2 = vec_sub(vecp, vecq);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
638
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
639 vec_st(temp1, p << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
640 vec_st(temp2, q << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
641 #endif
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
642 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
643 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
644 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
645
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
646 /* Post IFFT complex multiply plus IFFT complex conjugate*/
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
647 for( i=0; i < 128; i+=4) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
648 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
649 #if 0
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
650 tmp_a_r = buf[(i + 0)].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
651 tmp_a_i = -1.0 * buf[(i + 0)].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
652 buf[(i + 0)].real =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
653 (tmp_a_r * xcos1[(i + 0)]) - (tmp_a_i * xsin1[(i + 0)]);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
654 buf[(i + 0)].imag =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
655 (tmp_a_r * xsin1[(i + 0)]) + (tmp_a_i * xcos1[(i + 0)]);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
656
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
657 tmp_a_r = buf[(i + 1)].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
658 tmp_a_i = -1.0 * buf[(i + 1)].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
659 buf[(i + 1)].real =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
660 (tmp_a_r * xcos1[(i + 1)]) - (tmp_a_i * xsin1[(i + 1)]);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
661 buf[(i + 1)].imag =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
662 (tmp_a_r * xsin1[(i + 1)]) + (tmp_a_i * xcos1[(i + 1)]);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
663
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
664 tmp_a_r = buf[(i + 2)].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
665 tmp_a_i = -1.0 * buf[(i + 2)].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
666 buf[(i + 2)].real =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
667 (tmp_a_r * xcos1[(i + 2)]) - (tmp_a_i * xsin1[(i + 2)]);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
668 buf[(i + 2)].imag =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
669 (tmp_a_r * xsin1[(i + 2)]) + (tmp_a_i * xcos1[(i + 2)]);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
670
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
671 tmp_a_r = buf[(i + 3)].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
672 tmp_a_i = -1.0 * buf[(i + 3)].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
673 buf[(i + 3)].real =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
674 (tmp_a_r * xcos1[(i + 3)]) - (tmp_a_i * xsin1[(i + 3)]);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
675 buf[(i + 3)].imag =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
676 (tmp_a_r * xsin1[(i + 3)]) + (tmp_a_i * xcos1[(i + 3)]);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
677 #else
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
678 vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
679 vector float temp0022, temp1133, tempCS01;
9122
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
680 const vector float vczero = (const vector float)FOUROF(0.);
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
681
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
682 bufv_0 = vec_ld((i + 0) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
683 bufv_2 = vec_ld((i + 2) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
684
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
685 cosv = vec_ld(i << 2, xcos1);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
686 sinv = vec_ld(i << 2, xsin1);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
687
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
688 temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
689 temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
690 tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
691 temp1 = vec_madd(temp0022, tempCS01, vczero);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
692 tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
693 temp2 = vec_madd(temp1133, tempCS01, vczero);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
694 bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
695
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
696 vec_st(bufv_0, (i + 0) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
697
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
698 /* idem with bufv_2 and high-order cosv/sinv */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
699
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
700 temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
701 temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
702 tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
703 temp1 = vec_madd(temp0022, tempCS01, vczero);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
704 tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
705 temp2 = vec_madd(temp1133, tempCS01, vczero);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
706 bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
707
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
708 vec_st(bufv_2, (i + 2) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
709
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
710 #endif
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
711 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
712
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
713 data_ptr = data;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
714 delay_ptr = delay;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
715 window_ptr = imdct_window;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
716
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
717 /* Window and convert to real valued signal */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
718 for(i=0; i< 64; i++) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
719 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
720 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
721 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
722
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
723 for(i=0; i< 64; i++) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
724 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
725 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
726 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
727
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
728 /* The trailing edge of the window goes into the delay line */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
729 delay_ptr = delay;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
730
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
731 for(i=0; i< 64; i++) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
732 *delay_ptr++ = -buf[64+i].real * *--window_ptr;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
733 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
734 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
735
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
736 for(i=0; i<64; i++) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
737 *delay_ptr++ = buf[i].imag * *--window_ptr;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
738 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
739 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
740 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
741 #endif
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
742
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
743
4497
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
744 // Stuff below this line is borrowed from libac3
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
745 #include "srfftp.h"
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
746 #ifdef ARCH_X86
4497
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
747 #ifndef HAVE_3DNOW
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
748 #define HAVE_3DNOW 1
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
749 #endif
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
750 #include "srfftp_3dnow.h"
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
751
8451
fb88ccbc5ccc compiler warning fixes
arpi
parents: 8254
diff changeset
752 const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }};
fb88ccbc5ccc compiler warning fixes
arpi
parents: 8254
diff changeset
753 const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }};
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
754 const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
755
4497
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
756 #undef HAVE_3DNOWEX
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
757 #include "imdct_3dnow.h"
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
758 #define HAVE_3DNOWEX
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
759 #include "imdct_3dnow.h"
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
760
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
761 void
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
762 imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
763 {
8254
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
764 /* int i,k;
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
765 int p,q;*/
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
766 int m;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
767 int two_m;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
768 int two_m_plus_one;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
769
8254
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
770 /* sample_t tmp_a_i;
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
771 sample_t tmp_a_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
772 sample_t tmp_b_i;
8254
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
773 sample_t tmp_b_r;*/
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
774
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
775 sample_t *data_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
776 sample_t *delay_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
777 sample_t *window_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
778
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
779 /* 512 IMDCT with source and dest data in 'data' */
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
780 /* see the C version (dct_do_512()); it is almost identical, just in C */
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
781
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
782 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
783 /* Bit reversed shuffling */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
784 asm volatile(
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
785 "xorl %%esi, %%esi \n\t"
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
786 "leal "MANGLE(bit_reverse_512)", %%eax \n\t"
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
787 "movl $1008, %%edi \n\t"
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
788 "pushl %%ebp \n\t" //use ebp without telling gcc
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
789 ".balign 16 \n\t"
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
790 "1: \n\t"
3584
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
791 "movlps (%0, %%esi), %%xmm0 \n\t" // XXXI
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
792 "movhps 8(%0, %%edi), %%xmm0 \n\t" // RXXI
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
793 "movlps 8(%0, %%esi), %%xmm1 \n\t" // XXXi
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
794 "movhps (%0, %%edi), %%xmm1 \n\t" // rXXi
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
795 "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
796 "movaps "MANGLE(sseSinCos1c)"(%%esi), %%xmm2\n\t"
3584
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
797 "mulps %%xmm0, %%xmm2 \n\t"
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
798 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
799 "mulps "MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t"
3584
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
800 "subps %%xmm0, %%xmm2 \n\t"
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
801 "movzbl (%%eax), %%edx \n\t"
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
802 "movzbl 1(%%eax), %%ebp \n\t"
3584
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
803 "movlps %%xmm2, (%1, %%edx,8) \n\t"
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
804 "movhps %%xmm2, (%1, %%ebp,8) \n\t"
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
805 "addl $16, %%esi \n\t"
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
806 "addl $2, %%eax \n\t" // avoid complex addressing for P4 crap
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
807 "subl $16, %%edi \n\t"
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
808 " jnc 1b \n\t"
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
809 "popl %%ebp \n\t"//no we didnt touch ebp *g*
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
810 :: "b" (data), "c" (buf)
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
811 : "%esi", "%edi", "%eax", "%edx"
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
812 );
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
813
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
814
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
815 /* FFT Merge */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
816 /* unoptimized variant
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
817 for (m=1; m < 7; m++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
818 if(m)
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
819 two_m = (1 << m);
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
820 else
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
821 two_m = 1;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
822
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
823 two_m_plus_one = (1 << (m+1));
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
824
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
825 for(i = 0; i < 128; i += two_m_plus_one) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
826 for(k = 0; k < two_m; k++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
827 p = k + i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
828 q = p + two_m;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
829 tmp_a_r = buf[p].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
830 tmp_a_i = buf[p].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
831 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
832 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
833 buf[p].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
834 buf[p].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
835 buf[q].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
836 buf[q].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
837 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
838 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
839 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
840 */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
841
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
842 /* 1. iteration */
3549
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
843 // Note w[0][0]={1,0}
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
844 asm volatile(
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
845 "xorps %%xmm1, %%xmm1 \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
846 "xorps %%xmm2, %%xmm2 \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
847 "movl %0, %%esi \n\t"
3529
a86166b495a6 sse opt
michael
parents: 3527
diff changeset
848 ".balign 16 \n\t"
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
849 "1: \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
850 "movlps (%%esi), %%xmm0 \n\t" //buf[p]
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
851 "movlps 8(%%esi), %%xmm1\n\t" //buf[q]
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
852 "movhps (%%esi), %%xmm0 \n\t" //buf[p]
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
853 "movhps 8(%%esi), %%xmm2\n\t" //buf[q]
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
854 "addps %%xmm1, %%xmm0 \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
855 "subps %%xmm2, %%xmm0 \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
856 "movaps %%xmm0, (%%esi) \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
857 "addl $16, %%esi \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
858 "cmpl %1, %%esi \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
859 " jb 1b \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
860 :: "g" (buf), "r" (buf + 128)
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
861 : "%esi"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
862 );
3549
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
863
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
864 /* 2. iteration */
3512
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
865 // Note w[1]={{1,0}, {0,-1}}
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
866 asm volatile(
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
867 "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
3512
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
868 "movl %0, %%esi \n\t"
3529
a86166b495a6 sse opt
michael
parents: 3527
diff changeset
869 ".balign 16 \n\t"
3512
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
870 "1: \n\t"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
871 "movaps 16(%%esi), %%xmm2 \n\t" //r2,i2,r3,i3
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
872 "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
873 "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
874 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
875 "movaps (%%esi), %%xmm1 \n\t" //r0,i0,r1,i1
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
876 "addps %%xmm2, %%xmm0 \n\t"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
877 "subps %%xmm2, %%xmm1 \n\t"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
878 "movaps %%xmm0, (%%esi) \n\t"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
879 "movaps %%xmm1, 16(%%esi) \n\t"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
880 "addl $32, %%esi \n\t"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
881 "cmpl %1, %%esi \n\t"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
882 " jb 1b \n\t"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
883 :: "g" (buf), "r" (buf + 128)
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
884 : "%esi"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
885 );
3549
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
886
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
887 /* 3. iteration */
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
888 /*
3483390a902b sse opt
michael
parents: 3529
diff changeset
889 Note sseW2+0={1,1,sqrt(2),sqrt(2)}
3483390a902b sse opt
michael
parents: 3529
diff changeset
890 Note sseW2+16={0,0,sqrt(2),-sqrt(2)}
3483390a902b sse opt
michael
parents: 3529
diff changeset
891 Note sseW2+32={0,0,-sqrt(2),-sqrt(2)}
3483390a902b sse opt
michael
parents: 3529
diff changeset
892 Note sseW2+48={1,-1,sqrt(2),-sqrt(2)}
3483390a902b sse opt
michael
parents: 3529
diff changeset
893 */
3483390a902b sse opt
michael
parents: 3529
diff changeset
894 asm volatile(
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
895 "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
896 "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
897 "xorps %%xmm5, %%xmm5 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
898 "xorps %%xmm2, %%xmm2 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
899 "movl %0, %%esi \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
900 ".balign 16 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
901 "1: \n\t"
3537
d7e5a32643c9 C optimizations
michael
parents: 3534
diff changeset
902 "movaps 32(%%esi), %%xmm2 \n\t" //r4,i4,r5,i5
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
903 "movaps 48(%%esi), %%xmm3 \n\t" //r6,i6,r7,i7
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
904 "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //sseW2+0, multiplied by r4,i4,r5,i5 below
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
905 "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //sseW2+32, multiplied by r6,i6,r7,i7 below
3537
d7e5a32643c9 C optimizations
michael
parents: 3534
diff changeset
906 "mulps %%xmm2, %%xmm4 \n\t"
d7e5a32643c9 C optimizations
michael
parents: 3534
diff changeset
907 "mulps %%xmm3, %%xmm5 \n\t"
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
908 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5
3483390a902b sse opt
michael
parents: 3529
diff changeset
909 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7
3537
d7e5a32643c9 C optimizations
michael
parents: 3534
diff changeset
910 "mulps %%xmm6, %%xmm3 \n\t"
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
911 "mulps %%xmm7, %%xmm2 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
912 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1
3483390a902b sse opt
michael
parents: 3529
diff changeset
913 "movaps 16(%%esi), %%xmm1 \n\t" //r2,i2,r3,i3
3483390a902b sse opt
michael
parents: 3529
diff changeset
914 "addps %%xmm4, %%xmm2 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
915 "addps %%xmm5, %%xmm3 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
916 "movaps %%xmm2, %%xmm4 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
917 "movaps %%xmm3, %%xmm5 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
918 "addps %%xmm0, %%xmm2 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
919 "addps %%xmm1, %%xmm3 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
920 "subps %%xmm4, %%xmm0 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
921 "subps %%xmm5, %%xmm1 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
922 "movaps %%xmm2, (%%esi) \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
923 "movaps %%xmm3, 16(%%esi) \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
924 "movaps %%xmm0, 32(%%esi) \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
925 "movaps %%xmm1, 48(%%esi) \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
926 "addl $64, %%esi \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
927 "cmpl %1, %%esi \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
928 " jb 1b \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
929 :: "g" (buf), "r" (buf + 128)
3483390a902b sse opt
michael
parents: 3529
diff changeset
930 : "%esi"
3483390a902b sse opt
michael
parents: 3529
diff changeset
931 );
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
932
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
933 /* 4-7. iterations */
3546
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
934 for (m=3; m < 7; m++) {
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
935 two_m = (1 << m);
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
936 two_m_plus_one = two_m<<1;
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
937 asm volatile(
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
938 "movl %0, %%esi \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
939 ".balign 16 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
940 "1: \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
941 "xorl %%edi, %%edi \n\t" // k
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
942 "leal (%%esi, %3), %%edx \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
943 "2: \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
944 "movaps (%%edx, %%edi), %%xmm1 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
945 "movaps (%4, %%edi, 2), %%xmm2 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
946 "mulps %%xmm1, %%xmm2 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
947 "shufps $0xB1, %%xmm1, %%xmm1 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
948 "mulps 16(%4, %%edi, 2), %%xmm1 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
949 "movaps (%%esi, %%edi), %%xmm0 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
950 "addps %%xmm2, %%xmm1 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
951 "movaps %%xmm1, %%xmm2 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
952 "addps %%xmm0, %%xmm1 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
953 "subps %%xmm2, %%xmm0 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
954 "movaps %%xmm1, (%%esi, %%edi) \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
955 "movaps %%xmm0, (%%edx, %%edi) \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
956 "addl $16, %%edi \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
957 "cmpl %3, %%edi \n\t" //FIXME (opt) count against 0
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
958 " jb 2b \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
959 "addl %2, %%esi \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
960 "cmpl %1, %%esi \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
961 " jb 1b \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
962 :: "g" (buf), "m" (buf+128), "m" (two_m_plus_one<<3), "r" (two_m<<3),
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
963 "r" (sseW[m])
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
964 : "%esi", "%edi", "%edx"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
965 );
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
966 }
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
967
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
968 /* Post IFFT complex multiply plus IFFT complex conjugate*/
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
969 asm volatile(
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
970 "movl $-1024, %%esi \n\t"
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
971 ".balign 16 \n\t"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
972 "1: \n\t"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
973 "movaps (%0, %%esi), %%xmm0 \n\t"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
974 "movaps (%0, %%esi), %%xmm1 \n\t"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
975 "shufps $0xB1, %%xmm0, %%xmm0 \n\t"
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
976 "mulps 1024+"MANGLE(sseSinCos1c)"(%%esi), %%xmm1\n\t"
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
977 "mulps 1024+"MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t"
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
978 "addps %%xmm1, %%xmm0 \n\t"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
979 "movaps %%xmm0, (%0, %%esi) \n\t"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
980 "addl $16, %%esi \n\t"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
981 " jnz 1b \n\t"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
982 :: "r" (buf+128)
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
983 : "%esi"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
984 );
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
985
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
986
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
987 data_ptr = data;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
988 delay_ptr = delay;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
989 window_ptr = imdct_window;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
990
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
991 /* Window and convert to real valued signal */
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
992 asm volatile(
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
993 "xorl %%edi, %%edi \n\t" // 0
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
994 "xorl %%esi, %%esi \n\t" // 0
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
995 "movss %3, %%xmm2 \n\t" // bias
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
996 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
997 ".balign 16 \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
998 "1: \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
999 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ?
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1000 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ?
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1001 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ?
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1002 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ?
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1003 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
1004 "mulps "MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1005 "addps (%2, %%esi), %%xmm0 \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1006 "addps %%xmm2, %%xmm0 \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1007 "movaps %%xmm0, (%1, %%esi) \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1008 "addl $16, %%esi \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1009 "subl $16, %%edi \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1010 "cmpl $512, %%esi \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1011 " jb 1b \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1012 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1013 : "%esi", "%edi"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1014 );
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1015 data_ptr+=128;
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1016 delay_ptr+=128;
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1017 // window_ptr+=128;
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
1018
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1019 asm volatile(
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1020 "movl $1024, %%edi \n\t" // 1024
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1021 "xorl %%esi, %%esi \n\t" // 0
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1022 "movss %3, %%xmm2 \n\t" // bias
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1023 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1024 ".balign 16 \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1025 "1: \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1026 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1027 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1028 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1029 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1030 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
1031 "mulps 512+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1032 "addps (%2, %%esi), %%xmm0 \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1033 "addps %%xmm2, %%xmm0 \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1034 "movaps %%xmm0, (%1, %%esi) \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1035 "addl $16, %%esi \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1036 "subl $16, %%edi \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1037 "cmpl $512, %%esi \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1038 " jb 1b \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1039 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1040 : "%esi", "%edi"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1041 );
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1042 data_ptr+=128;
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1043 // window_ptr+=128;
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1044
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1045 /* The trailing edge of the window goes into the delay line */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1046 delay_ptr = delay;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1047
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1048 asm volatile(
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1049 "xorl %%edi, %%edi \n\t" // 0
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1050 "xorl %%esi, %%esi \n\t" // 0
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1051 ".balign 16 \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1052 "1: \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1053 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1054 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1055 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1056 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1057 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
1058 "mulps 1024+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1059 "movaps %%xmm0, (%1, %%esi) \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1060 "addl $16, %%esi \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1061 "subl $16, %%edi \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1062 "cmpl $512, %%esi \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1063 " jb 1b \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1064 :: "r" (buf+64), "r" (delay_ptr)
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1065 : "%esi", "%edi"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1066 );
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1067 delay_ptr+=128;
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1068 // window_ptr-=128;
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
1069
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1070 asm volatile(
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1071 "movl $1024, %%edi \n\t" // 1024
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1072 "xorl %%esi, %%esi \n\t" // 0
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1073 ".balign 16 \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1074 "1: \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1075 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ?
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1076 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ?
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1077 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ?
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1078 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ?
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1079 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
1080 "mulps 1536+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1081 "movaps %%xmm0, (%1, %%esi) \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1082 "addl $16, %%esi \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1083 "subl $16, %%edi \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1084 "cmpl $512, %%esi \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1085 " jb 1b \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1086 :: "r" (buf), "r" (delay_ptr)
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1087 : "%esi", "%edi"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1088 );
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1089 }
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
1090 #endif //arch_x86
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1091
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1092 void
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1093 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias)
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1094 {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1095 int i,k;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1096 int p,q;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1097 int m;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1098 int two_m;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1099 int two_m_plus_one;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1100
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1101 sample_t tmp_a_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1102 sample_t tmp_a_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1103 sample_t tmp_b_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1104 sample_t tmp_b_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1105
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1106 sample_t *data_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1107 sample_t *delay_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1108 sample_t *window_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1109
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1110 complex_t *buf_1, *buf_2;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1111
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1112 buf_1 = &buf[0];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1113 buf_2 = &buf[64];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1114
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1115 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1116 for(k=0; k<64; k++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1117 /* X1[k] = X[2*k] */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1118 /* X2[k] = X[2*k+1] */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1119
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1120 p = 2 * (128-2*k-1);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1121 q = 2 * (2 * k);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1122
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1123 /* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1124 buf_1[k].real = data[p] * xcos2[k] - data[q] * xsin2[k];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1125 buf_1[k].imag = -1.0f * (data[q] * xcos2[k] + data[p] * xsin2[k]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1126 /* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1127 buf_2[k].real = data[p + 1] * xcos2[k] - data[q + 1] * xsin2[k];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1128 buf_2[k].imag = -1.0f * ( data[q + 1] * xcos2[k] + data[p + 1] * xsin2[k]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1129 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1130
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1131 /* IFFT Bit reversed shuffling */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1132 for(i=0; i<64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1133 k = bit_reverse_256[i];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1134 if (k < i) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1135 swap_cmplx(&buf_1[i],&buf_1[k]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1136 swap_cmplx(&buf_2[i],&buf_2[k]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1137 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1138 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1139
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1140 /* FFT Merge */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1141 for (m=0; m < 6; m++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1142 two_m = (1 << m);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1143 two_m_plus_one = (1 << (m+1));
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1144
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1145 /* FIXME */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1146 if(m)
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1147 two_m = (1 << m);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1148 else
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1149 two_m = 1;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1150
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1151 for(k = 0; k < two_m; k++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1152 for(i = 0; i < 64; i += two_m_plus_one) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1153 p = k + i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1154 q = p + two_m;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1155 /* Do block 1 */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1156 tmp_a_r = buf_1[p].real;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1157 tmp_a_i = buf_1[p].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1158 tmp_b_r = buf_1[q].real * w[m][k].real - buf_1[q].imag * w[m][k].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1159 tmp_b_i = buf_1[q].imag * w[m][k].real + buf_1[q].real * w[m][k].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1160 buf_1[p].real = tmp_a_r + tmp_b_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1161 buf_1[p].imag = tmp_a_i + tmp_b_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1162 buf_1[q].real = tmp_a_r - tmp_b_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1163 buf_1[q].imag = tmp_a_i - tmp_b_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1164
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1165 /* Do block 2 */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1166 tmp_a_r = buf_2[p].real;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1167 tmp_a_i = buf_2[p].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1168 tmp_b_r = buf_2[q].real * w[m][k].real - buf_2[q].imag * w[m][k].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1169 tmp_b_i = buf_2[q].imag * w[m][k].real + buf_2[q].real * w[m][k].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1170 buf_2[p].real = tmp_a_r + tmp_b_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1171 buf_2[p].imag = tmp_a_i + tmp_b_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1172 buf_2[q].real = tmp_a_r - tmp_b_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1173 buf_2[q].imag = tmp_a_i - tmp_b_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1174 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1175 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1176 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1177
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1178 /* Post IFFT complex multiply */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1179 for( i=0; i < 64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1180 /* y1[n] = z1[n] * (xcos2[n] + j * xsin2[n]) ; */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1181 tmp_a_r = buf_1[i].real;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1182 tmp_a_i = -buf_1[i].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1183 buf_1[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1184 buf_1[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1185 /* y2[n] = z2[n] * (xcos2[n] + j * xsin2[n]) ; */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1186 tmp_a_r = buf_2[i].real;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1187 tmp_a_i = -buf_2[i].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1188 buf_2[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1189 buf_2[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1190 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1191
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1192 data_ptr = data;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1193 delay_ptr = delay;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1194 window_ptr = imdct_window;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1195
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1196 /* Window and convert to real valued signal */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1197 for(i=0; i< 64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1198 *data_ptr++ = -buf_1[i].imag * *window_ptr++ + *delay_ptr++ + bias;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1199 *data_ptr++ = buf_1[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1200 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1201
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1202 for(i=0; i< 64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1203 *data_ptr++ = -buf_1[i].real * *window_ptr++ + *delay_ptr++ + bias;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1204 *data_ptr++ = buf_1[64-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1205 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1206
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1207 delay_ptr = delay;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1208
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1209 for(i=0; i< 64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1210 *delay_ptr++ = -buf_2[i].real * *--window_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1211 *delay_ptr++ = buf_2[64-i-1].imag * *--window_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1212 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1213
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1214 for(i=0; i< 64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1215 *delay_ptr++ = buf_2[i].imag * *--window_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1216 *delay_ptr++ = -buf_2[64-i-1].real * *--window_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1217 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1218 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1219
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1220 void imdct_init (uint32_t mm_accel)
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1221 {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1222 #ifdef LIBA52_MLIB
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1223 if (mm_accel & MM_ACCEL_MLIB) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1224 fprintf (stderr, "Using mlib for IMDCT transform\n");
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1225 imdct_512 = imdct_do_512_mlib;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1226 imdct_256 = imdct_do_256_mlib;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1227 } else
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1228 #endif
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1229 {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1230 int i, j, k;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1231
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1232 /* Twiddle factors to turn IFFT into IMDCT */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1233 for (i = 0; i < 128; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1234 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1235 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1236 }
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
1237 #ifdef ARCH_X86
3527
5a88b21cfe8a sse opt
michael
parents: 3512
diff changeset
1238 for (i = 0; i < 128; i++) {
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
1239 sseSinCos1c[2*i+0]= xcos1[i];
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
1240 sseSinCos1c[2*i+1]= -xcos1[i];
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
1241 sseSinCos1d[2*i+0]= xsin1[i];
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
1242 sseSinCos1d[2*i+1]= xsin1[i];
3527
5a88b21cfe8a sse opt
michael
parents: 3512
diff changeset
1243 }
5a88b21cfe8a sse opt
michael
parents: 3512
diff changeset
1244 #endif
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1245
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1246 /* More twiddle factors to turn IFFT into IMDCT */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1247 for (i = 0; i < 64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1248 xcos2[i] = -cos ((M_PI / 1024) * (8 * i + 1));
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1249 xsin2[i] = -sin ((M_PI / 1024) * (8 * i + 1));
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1250 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1251
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1252 for (i = 0; i < 7; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1253 j = 1 << i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1254 for (k = 0; k < j; k++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1255 w[i][k].real = cos (-M_PI * k / j);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1256 w[i][k].imag = sin (-M_PI * k / j);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1257 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1258 }
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
1259 #ifdef ARCH_X86
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
1260 for (i = 1; i < 7; i++) {
3483390a902b sse opt
michael
parents: 3529
diff changeset
1261 j = 1 << i;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1262 for (k = 0; k < j; k+=2) {
3483390a902b sse opt
michael
parents: 3529
diff changeset
1263
3483390a902b sse opt
michael
parents: 3529
diff changeset
1264 sseW[i][4*k + 0] = w[i][k+0].real;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1265 sseW[i][4*k + 1] = w[i][k+0].real;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1266 sseW[i][4*k + 2] = w[i][k+1].real;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1267 sseW[i][4*k + 3] = w[i][k+1].real;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1268
3483390a902b sse opt
michael
parents: 3529
diff changeset
1269 sseW[i][4*k + 4] = -w[i][k+0].imag;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1270 sseW[i][4*k + 5] = w[i][k+0].imag;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1271 sseW[i][4*k + 6] = -w[i][k+1].imag;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1272 sseW[i][4*k + 7] = w[i][k+1].imag;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1273
3483390a902b sse opt
michael
parents: 3529
diff changeset
1274 //we multiply more or less uninitialized numbers so we need to use exactly 0.0
3483390a902b sse opt
michael
parents: 3529
diff changeset
1275 if(k==0)
3483390a902b sse opt
michael
parents: 3529
diff changeset
1276 {
3483390a902b sse opt
michael
parents: 3529
diff changeset
1277 // sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1278 sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1279 }
3483390a902b sse opt
michael
parents: 3529
diff changeset
1280
3483390a902b sse opt
michael
parents: 3529
diff changeset
1281 if(2*k == j)
3483390a902b sse opt
michael
parents: 3529
diff changeset
1282 {
3483390a902b sse opt
michael
parents: 3529
diff changeset
1283 sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1284 // sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0);
3483390a902b sse opt
michael
parents: 3529
diff changeset
1285 }
3483390a902b sse opt
michael
parents: 3529
diff changeset
1286 }
3483390a902b sse opt
michael
parents: 3529
diff changeset
1287 }
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1288
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1289 for(i=0; i<128; i++)
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1290 {
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1291 sseWindow[2*i+0]= -imdct_window[2*i+0];
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1292 sseWindow[2*i+1]= imdct_window[2*i+1];
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1293 }
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1294
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1295 for(i=0; i<64; i++)
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1296 {
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1297 sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1];
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1298 sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0];
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1299 sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1];
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1300 sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0];
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1301 }
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
1302 #endif // arch_x86
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
1303
3720
120ac80f13c2 Fixed #ifdef discrepancy that was breaking compilation on PPC platform
melanson
parents: 3623
diff changeset
1304 imdct_512 = imdct_do_512;
120ac80f13c2 Fixed #ifdef discrepancy that was breaking compilation on PPC platform
melanson
parents: 3623
diff changeset
1305 #ifdef ARCH_X86
4497
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1306 if(mm_accel & MM_ACCEL_X86_SSE)
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1307 {
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1308 fprintf (stderr, "Using SSE optimized IMDCT transform\n");
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1309 imdct_512 = imdct_do_512_sse;
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1310 }
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1311 else
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1312 if(mm_accel & MM_ACCEL_X86_3DNOWEXT)
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1313 {
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1314 fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n");
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1315 imdct_512 = imdct_do_512_3dnowex;
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1316 }
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1317 else
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1318 if(mm_accel & MM_ACCEL_X86_3DNOW)
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1319 {
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1320 fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1321 imdct_512 = imdct_do_512_3dnow;
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1322 }
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1323 else
3720
120ac80f13c2 Fixed #ifdef discrepancy that was breaking compilation on PPC platform
melanson
parents: 3623
diff changeset
1324 #endif // arch_x86
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
1325 #ifdef HAVE_ALTIVEC
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
1326 if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
1327 {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
1328 fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
1329 imdct_512 = imdct_do_512_altivec;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
1330 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
1331 else
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
1332 #endif
4497
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1333 fprintf (stderr, "No accelerated IMDCT transform found\n");
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1334 imdct_256 = imdct_do_256;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1335 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1336 }
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1337
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1338 static void fft_asmb(int k, complex_t *x, complex_t *wTB,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1339 const complex_t *d, const complex_t *d_3)
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1340 {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1341 register complex_t *x2k, *x3k, *x4k, *wB;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1342 register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1343
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1344 x2k = x + 2 * k;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1345 x3k = x2k + 2 * k;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1346 x4k = x3k + 2 * k;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1347 wB = wTB + 2 * k;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1348
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1349 TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1350 TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1351
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1352 --k;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1353 for(;;) {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1354 TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1355 TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1356 if (!--k) break;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1357 x += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1358 x2k += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1359 x3k += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1360 x4k += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1361 d += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1362 d_3 += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1363 wTB += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1364 wB += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1365 }
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1366
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1367 }
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1368
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1369 static void fft_asmb16(complex_t *x, complex_t *wTB)
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1370 {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1371 register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1372 int k = 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1373
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1374 /* transform x[0], x[8], x[4], x[12] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1375 TRANSZERO(x[0],x[4],x[8],x[12]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1376
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1377 /* transform x[1], x[9], x[5], x[13] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1378 TRANS(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1379
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1380 /* transform x[2], x[10], x[6], x[14] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1381 TRANSHALF_16(x[2],x[6],x[10],x[14]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1382
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1383 /* transform x[3], x[11], x[7], x[15] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1384 TRANS(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1385
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1386 }
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1387
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1388 static void fft_4(complex_t *x)
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1389 {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1390 /* delta_p = 1 here */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1391 /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi*j/4}
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1392 */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1393
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1394 register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1395
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1396 yt_r = x[0].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1397 yb_r = yt_r - x[2].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1398 yt_r += x[2].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1399
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1400 u_r = x[1].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1401 vi_i = x[3].real - u_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1402 u_r += x[3].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1403
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1404 u_i = x[1].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1405 vi_r = u_i - x[3].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1406 u_i += x[3].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1407
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1408 yt_i = yt_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1409 yt_i += u_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1410 x[0].real = yt_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1411 yt_r -= u_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1412 x[2].real = yt_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1413 yt_i = yb_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1414 yt_i += vi_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1415 x[1].real = yt_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1416 yb_r -= vi_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1417 x[3].real = yb_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1418
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1419 yt_i = x[0].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1420 yb_i = yt_i - x[2].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1421 yt_i += x[2].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1422
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1423 yt_r = yt_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1424 yt_r += u_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1425 x[0].imag = yt_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1426 yt_i -= u_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1427 x[2].imag = yt_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1428 yt_r = yb_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1429 yt_r += vi_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1430 x[1].imag = yt_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1431 yb_i -= vi_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1432 x[3].imag = yb_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1433 }
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1434
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1435
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1436 static void fft_8(complex_t *x)
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1437 {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1438 /* delta_p = diag{1, sqrt(i)} here */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1439 /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8}
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1440 */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1441 register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1442
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1443 wT1_r = x[1].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1444 wT1_i = x[1].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1445 wB1_r = x[3].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1446 wB1_i = x[3].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1447
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1448 x[1] = x[2];
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1449 x[2] = x[4];
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1450 x[3] = x[6];
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1451 fft_4(&x[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1452
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1453
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1454 /* x[0] x[4] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1455 wT2_r = x[5].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1456 wT2_r += x[7].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1457 wT2_r += wT1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1458 wT2_r += wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1459 wT2_i = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1460 wT2_r += x[0].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1461 wT2_i = x[0].real - wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1462 x[0].real = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1463 x[4].real = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1464
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1465 wT2_i = x[5].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1466 wT2_i += x[7].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1467 wT2_i += wT1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1468 wT2_i += wB1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1469 wT2_r = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1470 wT2_r += x[0].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1471 wT2_i = x[0].imag - wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1472 x[0].imag = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1473 x[4].imag = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1474
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1475 /* x[2] x[6] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1476 wT2_r = x[5].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1477 wT2_r -= x[7].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1478 wT2_r += wT1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1479 wT2_r -= wB1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1480 wT2_i = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1481 wT2_r += x[2].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1482 wT2_i = x[2].real - wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1483 x[2].real = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1484 x[6].real = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1485
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1486 wT2_i = x[5].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1487 wT2_i -= x[7].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1488 wT2_i += wT1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1489 wT2_i -= wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1490 wT2_r = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1491 wT2_r += x[2].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1492 wT2_i = x[2].imag - wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1493 x[2].imag = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1494 x[6].imag = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1495
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1496
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1497 /* x[1] x[5] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1498 wT2_r = wT1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1499 wT2_r += wB1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1500 wT2_r -= x[5].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1501 wT2_r -= x[7].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1502 wT2_i = wT1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1503 wT2_i -= wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1504 wT2_i -= x[5].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1505 wT2_i += x[7].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1506
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1507 wB2_r = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1508 wB2_r += wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1509 wT2_i -= wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1510 wB2_r *= HSQRT2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1511 wT2_i *= HSQRT2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1512 wT2_r = wB2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1513 wB2_r += x[1].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1514 wT2_r = x[1].real - wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1515
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1516 wB2_i = x[5].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1517 x[1].real = wB2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1518 x[5].real = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1519
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1520 wT2_r = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1521 wT2_r += x[1].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1522 wT2_i = x[1].imag - wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1523 wB2_r = x[5].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1524 x[1].imag = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1525 x[5].imag = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1526
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1527 /* x[3] x[7] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1528 wT1_r -= wB1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1529 wT1_i += wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1530 wB1_r = wB2_i - x[7].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1531 wB1_i = wB2_r + x[7].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1532 wT1_r -= wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1533 wT1_i -= wB1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1534 wB1_r = wT1_r + wT1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1535 wB1_r *= HSQRT2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1536 wT1_i -= wT1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1537 wT1_i *= HSQRT2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1538 wB2_r = x[3].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1539 wB2_i = wB2_r + wT1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1540 wB2_r -= wT1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1541 x[3].real = wB2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1542 x[7].real = wB2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1543 wB2_i = x[3].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1544 wB2_r = wB2_i + wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1545 wB2_i -= wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1546 x[3].imag = wB2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1547 x[7].imag = wB2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1548 }
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1549
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1550
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1551 static void fft_128p(complex_t *a)
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1552 {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1553 fft_8(&a[0]); fft_4(&a[8]); fft_4(&a[12]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1554 fft_asmb16(&a[0], &a[8]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1555
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1556 fft_8(&a[16]), fft_8(&a[24]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1557 fft_asmb(4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1558
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1559 fft_8(&a[32]); fft_4(&a[40]); fft_4(&a[44]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1560 fft_asmb16(&a[32], &a[40]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1561
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1562 fft_8(&a[48]); fft_4(&a[56]); fft_4(&a[60]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1563 fft_asmb16(&a[48], &a[56]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1564
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1565 fft_asmb(8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1566
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1567 fft_8(&a[64]); fft_4(&a[72]); fft_4(&a[76]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1568 /* fft_16(&a[64]); */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1569 fft_asmb16(&a[64], &a[72]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1570
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1571 fft_8(&a[80]); fft_8(&a[88]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1572
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1573 /* fft_32(&a[64]); */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1574 fft_asmb(4, &a[64], &a[80],&delta32[0], &delta32_3[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1575
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1576 fft_8(&a[96]); fft_4(&a[104]), fft_4(&a[108]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1577 /* fft_16(&a[96]); */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1578 fft_asmb16(&a[96], &a[104]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1579
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1580 fft_8(&a[112]), fft_8(&a[120]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1581 /* fft_32(&a[96]); */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1582 fft_asmb(4, &a[96], &a[112], &delta32[0], &delta32_3[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1583
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1584 /* fft_128(&a[0]); */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1585 fft_asmb(16, &a[0], &a[64], &delta128[0], &delta128_3[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1586 }
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1587
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1588
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1589