annotate liba52/imdct.c @ 16534:cf10f859d829

Lists main A/V codecs supported by MEncoder, talks about how to select an input file for encoding. Taken from D. Richard Felker III The Great's encoding guide
author gpoirier
date Mon, 19 Sep 2005 21:42:00 +0000
parents 72764c0dad8a
children 7b408d60de9e
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1 /*
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
2 * imdct.c
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
3 * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
5 *
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
6 * This file is part of a52dec, a free ATSC A-52 stream decoder.
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
7 * See http://liba52.sourceforge.net/ for updates.
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
8 *
14991
07f1e7669772 Mark modified files as such to comply more closely with GPL §2a.
diego
parents: 12303
diff changeset
9 * Modified for use with MPlayer, changes contained in liba52_changes.diff.
07f1e7669772 Mark modified files as such to comply more closely with GPL §2a.
diego
parents: 12303
diff changeset
10 * detailed CVS changelog at http://www.mplayerhq.hu/cgi-bin/cvsweb.cgi/main/
07f1e7669772 Mark modified files as such to comply more closely with GPL §2a.
diego
parents: 12303
diff changeset
11 * $Id$
07f1e7669772 Mark modified files as such to comply more closely with GPL §2a.
diego
parents: 12303
diff changeset
12 *
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
13 * a52dec is free software; you can redistribute it and/or modify
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
14 * it under the terms of the GNU General Public License as published by
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
15 * the Free Software Foundation; either version 2 of the License, or
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
16 * (at your option) any later version.
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
17 *
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
18 * a52dec is distributed in the hope that it will be useful,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
21 * GNU General Public License for more details.
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
22 *
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
23 * You should have received a copy of the GNU General Public License
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
24 * along with this program; if not, write to the Free Software
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
25 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
26 *
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
27 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
28 * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
29 * michael did port them from libac3 (untested, perhaps totally broken)
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
30 * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
31 */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
32
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
33 #include "config.h"
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
34
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
35 #include <math.h>
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
36 #include <stdio.h>
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
37 #ifndef M_PI
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
38 #define M_PI 3.1415926535897932384626433832795029
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
39 #endif
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
40 #include <inttypes.h>
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
41
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
42 #include "a52.h"
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
43 #include "a52_internal.h"
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
44 #include "mm_accel.h"
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
45 #include "mangle.h"
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
46
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
47 #ifdef RUNTIME_CPUDETECT
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
48 #undef HAVE_3DNOWEX
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
49 #endif
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
50
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
51 #define USE_AC3_C
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
52
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
/*
 * Externally visible function pointers for the two transform entry points.
 * Both share one signature: a data array, a delay array and a bias value.
 * They are only declared here; whatever assigns them is outside this part
 * of the file (presumably an init routine that picks a CPU-specific
 * implementation -- confirm against the rest of the file).
 */
53 void (* imdct_256) (sample_t data[], sample_t delay[], sample_t bias);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
54 void (* imdct_512) (sample_t data[], sample_t delay[], sample_t bias);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
55
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
/*
 * Complex number made of two sample_t components.
 * Used as the element type of the FFT working buffer and twiddle tables.
 */
56 typedef struct complex_s {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
57 sample_t real;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
58 sample_t imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
59 } complex_t;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
60
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
/*
 * Forward declaration; the definition lies outside this part of the file.
 * imdct_do_512() calls it on the 128-entry complex working buffer `buf`
 * when USE_AC3_C is defined.
 */
61 static void fft_128p(complex_t *a);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
62
12303
f881c918739b attribute_used patch by (VMiklos <mamajom at axelero dot hu>)
michael
parents: 9122
diff changeset
/*
 * Index permutation for the 128-point transform: every value 0..127 appears
 * exactly once. When USE_AC3_C is defined, imdct_do_512() reads
 * j = pm128[i] to choose which input pair feeds buf[i] in the pre-IFFT
 * multiply. 16-byte aligned.
 */
63 static const int pm128[128] attribute_used __attribute__((aligned(16))) =
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
64 {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
65 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
66 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
67 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
68 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
69 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
70 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
71 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
72 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
73 };
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
74
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
75 /* 128 point bit-reverse LUT */
12303
f881c918739b attribute_used patch by (VMiklos <mamajom at axelero dot hu>)
michael
parents: 9122
diff changeset
/*
 * 128 entries; entry i holds i with its 7 bits reversed
 * (e.g. 1 -> 0x40, 2 -> 0x20, 3 -> 0x60).
 * imdct_do_512() uses it as the input reordering table when USE_AC3_C is
 * not defined.
 */
76 static uint8_t attribute_used bit_reverse_512[] = {
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
77 0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
78 0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
79 0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
80 0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
81 0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
82 0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
83 0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
84 0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
85 0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
86 0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
87 0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
88 0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
89 0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
90 0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
91 0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
92 0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
93
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
/*
 * 64 entries; entry i holds i with its 6 bits reversed
 * (e.g. 1 -> 0x20, 2 -> 0x10, 3 -> 0x30).
 * NOTE(review): no use is visible in this part of the file; presumably the
 * 256-point transform's counterpart of bit_reverse_512 -- confirm.
 */
94 static uint8_t bit_reverse_256[] = {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
95 0x00, 0x20, 0x10, 0x30, 0x08, 0x28, 0x18, 0x38,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
96 0x04, 0x24, 0x14, 0x34, 0x0c, 0x2c, 0x1c, 0x3c,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
97 0x02, 0x22, 0x12, 0x32, 0x0a, 0x2a, 0x1a, 0x3a,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
98 0x06, 0x26, 0x16, 0x36, 0x0e, 0x2e, 0x1e, 0x3e,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
99 0x01, 0x21, 0x11, 0x31, 0x09, 0x29, 0x19, 0x39,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
100 0x05, 0x25, 0x15, 0x35, 0x0d, 0x2d, 0x1d, 0x3d,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
101 0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
102 0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f};
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
103
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
/*
 * File-scope working storage. Both branches declare the same 16-byte
 * aligned, 128-entry complex buffer `buf`, which imdct_do_512() fills in
 * its pre-IFFT multiply and hands to the FFT. On x86 / x86-64 the first
 * branch additionally declares 16-byte aligned float tables for the SSE
 * code; nothing in this part of the file fills them. sseW[] collects the
 * sseW1..sseW6 tables, with slot 0 left NULL because sseW0 is commented out.
 */
104 #if defined(ARCH_X86) || defined(ARCH_X86_64)
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
105 // NOTE: SSE needs 16byte alignment or it will segfault
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
106 //
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
107 static complex_t __attribute__((aligned(16))) buf[128];
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
108 static float __attribute__((aligned(16))) sseSinCos1c[256];
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
109 static float __attribute__((aligned(16))) sseSinCos1d[256];
12303
f881c918739b attribute_used patch by (VMiklos <mamajom at axelero dot hu>)
michael
parents: 9122
diff changeset
110 static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
111 //static float __attribute__((aligned(16))) sseW0[4];
3483390a902b sse opt
michael
parents: 3529
diff changeset
112 static float __attribute__((aligned(16))) sseW1[8];
3483390a902b sse opt
michael
parents: 3529
diff changeset
113 static float __attribute__((aligned(16))) sseW2[16];
3483390a902b sse opt
michael
parents: 3529
diff changeset
114 static float __attribute__((aligned(16))) sseW3[32];
3483390a902b sse opt
michael
parents: 3529
diff changeset
115 static float __attribute__((aligned(16))) sseW4[64];
3483390a902b sse opt
michael
parents: 3529
diff changeset
116 static float __attribute__((aligned(16))) sseW5[128];
3483390a902b sse opt
michael
parents: 3529
diff changeset
117 static float __attribute__((aligned(16))) sseW6[256];
3483390a902b sse opt
michael
parents: 3529
diff changeset
118 static float __attribute__((aligned(16))) *sseW[7]=
3483390a902b sse opt
michael
parents: 3529
diff changeset
119 {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
120 static float __attribute__((aligned(16))) sseWindow[512];
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
121 #else
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
122 static complex_t __attribute__((aligned(16))) buf[128];
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
123 #endif
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
124
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
125 /* Twiddle factor LUT */
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
/*
 * Per-stage twiddle tables: w[m] points to a table of 2^m complex entries
 * (w_1 .. w_64). The commented-out "unoptimized variant" loop in
 * imdct_do_512() indexes them as w[m][k] with k < 2^m. The tables are only
 * declared here; whatever fills them is outside this part of the file.
 */
126 static complex_t __attribute__((aligned(16))) w_1[1];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
127 static complex_t __attribute__((aligned(16))) w_2[2];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
128 static complex_t __attribute__((aligned(16))) w_4[4];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
129 static complex_t __attribute__((aligned(16))) w_8[8];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
130 static complex_t __attribute__((aligned(16))) w_16[16];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
131 static complex_t __attribute__((aligned(16))) w_32[32];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
132 static complex_t __attribute__((aligned(16))) w_64[64];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
133 static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
134
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
135 /* Twiddle factors for IMDCT */
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
/*
 * xcos1 / xsin1 (128 entries each) are the per-index factors that
 * imdct_do_512() multiplies the input by before the FFT.
 * NOTE(review): xcos2 / xsin2 (64 entries each) are not used in this part
 * of the file; presumably the 256-point transform's counterparts -- confirm.
 * All four are only declared here; whatever fills them is outside this part
 * of the file.
 */
136 static sample_t __attribute__((aligned(16))) xcos1[128];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
137 static sample_t __attribute__((aligned(16))) xsin1[128];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
138 static sample_t __attribute__((aligned(16))) xcos2[64];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
139 static sample_t __attribute__((aligned(16))) xsin2[64];
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
140
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
141 /* Windowing function for Modified DCT - Thank you acroread */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
/*
 * 256 window coefficients (32 rows of 8), rising from 0.00014 to 1.00000.
 * Not declared static, so the table is visible to other translation units.
 * NOTE(review): the code that applies the window is outside this part of
 * the file.
 */
142 sample_t imdct_window[] = {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
143 0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
144 0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
145 0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
146 0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
147 0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
148 0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
149 0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
150 0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
151 0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
152 0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
153 0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
154 0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
155 0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
156 0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
157 0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
158 0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
159 0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
160 0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
161 0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
162 0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
163 0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
164 0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
165 0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
166 0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
167 0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
168 0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
169 0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
170 0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
171 0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
172 0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
173 0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
174 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000 };
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
175
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
176
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
/*
 * Exchange the two complex values that a and b point to.
 * Copies whole structs through a temporary, so passing the same pointer
 * for both leaves the value unchanged.
 */
177 static inline void swap_cmplx(complex_t *a, complex_t *b)
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
178 {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
179 complex_t tmp;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
180
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
181 tmp = *a;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
182 *a = *b;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
183 *b = tmp;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
184 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
185
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
186
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
187
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
/*
 * Return the complex product a * b.
 * Both operands are passed and the result is returned by value.
 */
188 static inline complex_t cmplx_mult(complex_t a, complex_t b)
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
189 {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
190 complex_t ret;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
191
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
/* (ar + j*ai) * (br + j*bi) = (ar*br - ai*bi) + j*(ar*bi + ai*br) */
192 ret.real = a.real * b.real - a.imag * b.imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
193 ret.imag = a.real * b.imag + a.imag * b.real;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
194
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
195 return ret;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
196 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
197
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
198 void
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
199 imdct_do_512(sample_t data[],sample_t delay[], sample_t bias)
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
200 {
8254
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
201 int i;
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
202 #ifndef USE_AC3_C
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
203 int k;
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
204 int p,q;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
205 int m;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
206 int two_m;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
207 int two_m_plus_one;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
208
8254
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
209 sample_t tmp_b_i;
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
210 sample_t tmp_b_r;
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
211 #endif
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
212 sample_t tmp_a_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
213 sample_t tmp_a_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
214
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
215 sample_t *data_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
216 sample_t *delay_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
217 sample_t *window_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
218
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
219 /* 512 IMDCT with source and dest data in 'data' */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
220
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
221 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
222 for( i=0; i < 128; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
223 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
224 #ifdef USE_AC3_C
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
225 int j= pm128[i];
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
226 #else
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
227 int j= bit_reverse_512[i];
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
228 #endif
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
229 buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
230 buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
231 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
232
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
233 /* FFT Merge */
3549
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
234 /* unoptimized variant
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
235 for (m=1; m < 7; m++) {
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
236 if(m)
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
237 two_m = (1 << m);
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
238 else
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
239 two_m = 1;
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
240
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
241 two_m_plus_one = (1 << (m+1));
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
242
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
243 for(i = 0; i < 128; i += two_m_plus_one) {
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
244 for(k = 0; k < two_m; k++) {
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
245 p = k + i;
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
246 q = p + two_m;
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
247 tmp_a_r = buf[p].real;
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
248 tmp_a_i = buf[p].imag;
3549
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
249 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
250 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
251 buf[p].real = tmp_a_r + tmp_b_r;
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
252 buf[p].imag = tmp_a_i + tmp_b_i;
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
253 buf[q].real = tmp_a_r - tmp_b_r;
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
254 buf[q].imag = tmp_a_i - tmp_b_i;
3549
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
255 }
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
256 }
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
257 }
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
258 */
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
259 #ifdef USE_AC3_C
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
260 fft_128p (&buf[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
261 #else
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
262
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
263 /* 1. iteration */
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
264 for(i = 0; i < 128; i += 2) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
265 tmp_a_r = buf[i].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
266 tmp_a_i = buf[i].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
267 tmp_b_r = buf[i+1].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
268 tmp_b_i = buf[i+1].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
269 buf[i].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
270 buf[i].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
271 buf[i+1].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
272 buf[i+1].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
273 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
274
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
275 /* 2. iteration */
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
276 // Note w[1]={{1,0}, {0,-1}}
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
277 for(i = 0; i < 128; i += 4) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
278 tmp_a_r = buf[i].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
279 tmp_a_i = buf[i].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
280 tmp_b_r = buf[i+2].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
281 tmp_b_i = buf[i+2].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
282 buf[i].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
283 buf[i].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
284 buf[i+2].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
285 buf[i+2].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
286 tmp_a_r = buf[i+1].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
287 tmp_a_i = buf[i+1].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
288 tmp_b_r = buf[i+3].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
289 tmp_b_i = buf[i+3].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
290 buf[i+1].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
291 buf[i+1].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
292 buf[i+3].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
293 buf[i+3].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
294 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
295
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
296 /* 3. iteration */
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
297 for(i = 0; i < 128; i += 8) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
298 tmp_a_r = buf[i].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
299 tmp_a_i = buf[i].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
300 tmp_b_r = buf[i+4].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
301 tmp_b_i = buf[i+4].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
302 buf[i].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
303 buf[i].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
304 buf[i+4].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
305 buf[i+4].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
306 tmp_a_r = buf[1+i].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
307 tmp_a_i = buf[1+i].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
308 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
309 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
310 buf[1+i].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
311 buf[1+i].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
312 buf[i+5].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
313 buf[i+5].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
314 tmp_a_r = buf[i+2].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
315 tmp_a_i = buf[i+2].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
316 tmp_b_r = buf[i+6].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
317 tmp_b_i = - buf[i+6].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
318 buf[i+2].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
319 buf[i+2].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
320 buf[i+6].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
321 buf[i+6].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
322 tmp_a_r = buf[i+3].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
323 tmp_a_i = buf[i+3].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
324 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
325 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
326 buf[i+3].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
327 buf[i+3].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
328 buf[i+7].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
329 buf[i+7].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
330 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
331
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
332 /* 4-7. iterations */
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
333 for (m=3; m < 7; m++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
334 two_m = (1 << m);
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
335
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
336 two_m_plus_one = two_m<<1;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
337
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
338 for(i = 0; i < 128; i += two_m_plus_one) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
339 for(k = 0; k < two_m; k++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
340 int p = k + i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
341 int q = p + two_m;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
342 tmp_a_r = buf[p].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
343 tmp_a_i = buf[p].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
344 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
345 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
346 buf[p].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
347 buf[p].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
348 buf[q].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
349 buf[q].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
350 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
351 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
352 }
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
353 #endif
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
354 /* Post IFFT complex multiply plus IFFT complex conjugate*/
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
355 for( i=0; i < 128; i++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
356 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
357 tmp_a_r = buf[i].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
358 tmp_a_i = -1.0 * buf[i].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
359 buf[i].real =(tmp_a_r * xcos1[i]) - (tmp_a_i * xsin1[i]);
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
360 buf[i].imag =(tmp_a_r * xsin1[i]) + (tmp_a_i * xcos1[i]);
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
361 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
362
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
363 data_ptr = data;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
364 delay_ptr = delay;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
365 window_ptr = imdct_window;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
366
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
367 /* Window and convert to real valued signal */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
368 for(i=0; i< 64; i++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
369 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
370 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
371 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
372
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
373 for(i=0; i< 64; i++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
374 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
375 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
376 }
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
377
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
378 /* The trailing edge of the window goes into the delay line */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
379 delay_ptr = delay;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
380
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
381 for(i=0; i< 64; i++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
382 *delay_ptr++ = -buf[64+i].real * *--window_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
383 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
384 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
385
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
386 for(i=0; i<64; i++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
387 *delay_ptr++ = buf[i].imag * *--window_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
388 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
389 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
390 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
391
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
392 #ifdef HAVE_ALTIVEC
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
393
9122
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
394 #ifndef SYS_DARWIN
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
395 #include <altivec.h>
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
396 #endif
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
397
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
398 // used to build register permutation vectors (vcprm)
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
399 // the 's' variants select words from the _s_econd vector
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
400 #define WORD_0 0x00,0x01,0x02,0x03
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
401 #define WORD_1 0x04,0x05,0x06,0x07
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
402 #define WORD_2 0x08,0x09,0x0a,0x0b
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
403 #define WORD_3 0x0c,0x0d,0x0e,0x0f
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
404 #define WORD_s0 0x10,0x11,0x12,0x13
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
405 #define WORD_s1 0x14,0x15,0x16,0x17
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
406 #define WORD_s2 0x18,0x19,0x1a,0x1b
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
407 #define WORD_s3 0x1c,0x1d,0x1e,0x1f
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
408
9122
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
409 #ifdef SYS_DARWIN
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
410 #define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
9122
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
411 #else
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
412 #define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
413 #endif
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
414
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
415 // vcprmle is used to keep the same index as in the SSE version.
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
416 // it's the same as vcprm, with the argument order reversed
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
417 // ('le' is Little Endian)
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
418 #define vcprmle(a,b,c,d) vcprm(d,c,b,a)
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
419
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
420 // used to build inverse/identity vectors (vcii)
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
421 // n is _n_egative, p is _p_ositive
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
422 #define FLOAT_n -1.
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
423 #define FLOAT_p 1.
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
424
9122
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
425 #ifdef SYS_DARWIN
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
426 #define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
9122
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
427 #else
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
428 #define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
429 #endif
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
430
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
431 #ifdef SYS_DARWIN
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
432 #define FOUROF(a) (a)
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
433 #else
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
434 #define FOUROF(a) {a,a,a,a}
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
435 #endif
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
436
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
437
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
438 void
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
439 imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
440 {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
441 int i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
442 int k;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
443 int p,q;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
444 int m;
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
445 long two_m;
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
446 long two_m_plus_one;
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
447
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
448 sample_t tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
449 sample_t tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
450 sample_t tmp_a_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
451 sample_t tmp_a_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
452
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
453 sample_t *data_ptr;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
454 sample_t *delay_ptr;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
455 sample_t *window_ptr;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
456
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
457 /* 512 IMDCT with source and dest data in 'data' */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
458
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
459 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
460 for( i=0; i < 128; i++) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
461 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
462 int j= bit_reverse_512[i];
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
463 buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
464 buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
465 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
466
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
467 /* 1. iteration */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
468 for(i = 0; i < 128; i += 2) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
469 #if 0
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
470 tmp_a_r = buf[i].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
471 tmp_a_i = buf[i].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
472 tmp_b_r = buf[i+1].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
473 tmp_b_i = buf[i+1].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
474 buf[i].real = tmp_a_r + tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
475 buf[i].imag = tmp_a_i + tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
476 buf[i+1].real = tmp_a_r - tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
477 buf[i+1].imag = tmp_a_i - tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
478 #else
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
479 vector float temp, bufv;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
480
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
481 bufv = vec_ld(i << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
482 temp = vec_perm(bufv, bufv, vcprm(2,3,0,1));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
483 bufv = vec_madd(bufv, vcii(p,p,n,n), temp);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
484 vec_st(bufv, i << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
485 #endif
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
486 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
487
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
488 /* 2. iteration */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
489 // Note w[1]={{1,0}, {0,-1}}
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
490 for(i = 0; i < 128; i += 4) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
491 #if 0
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
492 tmp_a_r = buf[i].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
493 tmp_a_i = buf[i].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
494 tmp_b_r = buf[i+2].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
495 tmp_b_i = buf[i+2].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
496 buf[i].real = tmp_a_r + tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
497 buf[i].imag = tmp_a_i + tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
498 buf[i+2].real = tmp_a_r - tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
499 buf[i+2].imag = tmp_a_i - tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
500 tmp_a_r = buf[i+1].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
501 tmp_a_i = buf[i+1].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
502 /* WARNING: im <-> re here ! */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
503 tmp_b_r = buf[i+3].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
504 tmp_b_i = buf[i+3].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
505 buf[i+1].real = tmp_a_r + tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
506 buf[i+1].imag = tmp_a_i - tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
507 buf[i+3].real = tmp_a_r - tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
508 buf[i+3].imag = tmp_a_i + tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
509 #else
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
510 vector float buf01, buf23, temp1, temp2;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
511
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
512 buf01 = vec_ld((i + 0) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
513 buf23 = vec_ld((i + 2) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
514 buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
515
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
516 temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
517 temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
518
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
519 vec_st(temp1, (i + 0) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
520 vec_st(temp2, (i + 2) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
521 #endif
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
522 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
523
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
524 /* 3. iteration */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
525 for(i = 0; i < 128; i += 8) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
526 #if 0
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
527 tmp_a_r = buf[i].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
528 tmp_a_i = buf[i].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
529 tmp_b_r = buf[i+4].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
530 tmp_b_i = buf[i+4].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
531 buf[i].real = tmp_a_r + tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
532 buf[i].imag = tmp_a_i + tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
533 buf[i+4].real = tmp_a_r - tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
534 buf[i+4].imag = tmp_a_i - tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
535 tmp_a_r = buf[1+i].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
536 tmp_a_i = buf[1+i].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
537 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
538 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
539 buf[1+i].real = tmp_a_r + tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
540 buf[1+i].imag = tmp_a_i + tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
541 buf[i+5].real = tmp_a_r - tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
542 buf[i+5].imag = tmp_a_i - tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
543 tmp_a_r = buf[i+2].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
544 tmp_a_i = buf[i+2].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
545 /* WARNING re <-> im & sign */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
546 tmp_b_r = buf[i+6].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
547 tmp_b_i = - buf[i+6].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
548 buf[i+2].real = tmp_a_r + tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
549 buf[i+2].imag = tmp_a_i + tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
550 buf[i+6].real = tmp_a_r - tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
551 buf[i+6].imag = tmp_a_i - tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
552 tmp_a_r = buf[i+3].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
553 tmp_a_i = buf[i+3].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
554 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
555 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
556 buf[i+3].real = tmp_a_r + tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
557 buf[i+3].imag = tmp_a_i + tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
558 buf[i+7].real = tmp_a_r - tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
559 buf[i+7].imag = tmp_a_i - tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
560 #else
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
561 vector float buf01, buf23, buf45, buf67;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
562
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
563 buf01 = vec_ld((i + 0) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
564 buf23 = vec_ld((i + 2) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
565
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
566 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
567 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
568 buf[i+5].real = tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
569 buf[i+5].imag = tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
570 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
571 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
572 buf[i+7].real = tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
573 buf[i+7].imag = tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
574
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
575 buf23 = vec_ld((i + 2) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
576 buf45 = vec_ld((i + 4) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
577 buf67 = vec_ld((i + 6) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
578 buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
579
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
580 vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
581 vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
582 vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
583 vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
584 #endif
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
585 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
586
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
587 /* 4-7. iterations */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
588 for (m=3; m < 7; m++) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
589 two_m = (1 << m);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
590
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
591 two_m_plus_one = two_m<<1;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
592
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
593 for(i = 0; i < 128; i += two_m_plus_one) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
594 for(k = 0; k < two_m; k+=2) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
595 #if 0
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
596 int p = k + i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
597 int q = p + two_m;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
598 tmp_a_r = buf[p].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
599 tmp_a_i = buf[p].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
600 tmp_b_r =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
601 buf[q].real * w[m][k].real -
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
602 buf[q].imag * w[m][k].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
603 tmp_b_i =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
604 buf[q].imag * w[m][k].real +
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
605 buf[q].real * w[m][k].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
606 buf[p].real = tmp_a_r + tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
607 buf[p].imag = tmp_a_i + tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
608 buf[q].real = tmp_a_r - tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
609 buf[q].imag = tmp_a_i - tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
610
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
611 tmp_a_r = buf[(p + 1)].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
612 tmp_a_i = buf[(p + 1)].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
613 tmp_b_r =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
614 buf[(q + 1)].real * w[m][(k + 1)].real -
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
615 buf[(q + 1)].imag * w[m][(k + 1)].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
616 tmp_b_i =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
617 buf[(q + 1)].imag * w[m][(k + 1)].real +
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
618 buf[(q + 1)].real * w[m][(k + 1)].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
619 buf[(p + 1)].real = tmp_a_r + tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
620 buf[(p + 1)].imag = tmp_a_i + tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
621 buf[(q + 1)].real = tmp_a_r - tmp_b_r;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
622 buf[(q + 1)].imag = tmp_a_i - tmp_b_i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
623 #else
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
624 int p = k + i;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
625 int q = p + two_m;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
626 vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4;
9122
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
627 const vector float vczero = (const vector float)FOUROF(0.);
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
628 // first compute buf[q] and buf[q+1]
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
629 vecq = vec_ld(q << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
630 vecw = vec_ld(0, (float*)&(w[m][k]));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
631 temp1 = vec_madd(vecq, vecw, vczero);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
632 temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
633 temp2 = vec_madd(temp2, vecw, vczero);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
634 temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
635 temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
636 vecq = vec_madd(temp4, vcii(n,p,n,p), temp3);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
637 // then butterfly with buf[p] and buf[p+1]
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
638 vecp = vec_ld(p << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
639
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
640 temp1 = vec_add(vecp, vecq);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
641 temp2 = vec_sub(vecp, vecq);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
642
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
643 vec_st(temp1, p << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
644 vec_st(temp2, q << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
645 #endif
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
646 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
647 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
648 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
649
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
650 /* Post IFFT complex multiply plus IFFT complex conjugate*/
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
651 for( i=0; i < 128; i+=4) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
652 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
653 #if 0
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
654 tmp_a_r = buf[(i + 0)].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
655 tmp_a_i = -1.0 * buf[(i + 0)].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
656 buf[(i + 0)].real =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
657 (tmp_a_r * xcos1[(i + 0)]) - (tmp_a_i * xsin1[(i + 0)]);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
658 buf[(i + 0)].imag =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
659 (tmp_a_r * xsin1[(i + 0)]) + (tmp_a_i * xcos1[(i + 0)]);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
660
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
661 tmp_a_r = buf[(i + 1)].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
662 tmp_a_i = -1.0 * buf[(i + 1)].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
663 buf[(i + 1)].real =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
664 (tmp_a_r * xcos1[(i + 1)]) - (tmp_a_i * xsin1[(i + 1)]);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
665 buf[(i + 1)].imag =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
666 (tmp_a_r * xsin1[(i + 1)]) + (tmp_a_i * xcos1[(i + 1)]);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
667
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
668 tmp_a_r = buf[(i + 2)].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
669 tmp_a_i = -1.0 * buf[(i + 2)].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
670 buf[(i + 2)].real =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
671 (tmp_a_r * xcos1[(i + 2)]) - (tmp_a_i * xsin1[(i + 2)]);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
672 buf[(i + 2)].imag =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
673 (tmp_a_r * xsin1[(i + 2)]) + (tmp_a_i * xcos1[(i + 2)]);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
674
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
675 tmp_a_r = buf[(i + 3)].real;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
676 tmp_a_i = -1.0 * buf[(i + 3)].imag;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
677 buf[(i + 3)].real =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
678 (tmp_a_r * xcos1[(i + 3)]) - (tmp_a_i * xsin1[(i + 3)]);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
679 buf[(i + 3)].imag =
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
680 (tmp_a_r * xsin1[(i + 3)]) + (tmp_a_i * xcos1[(i + 3)]);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
681 #else
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
682 vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
683 vector float temp0022, temp1133, tempCS01;
9122
5ba896a38d75 The two attached patches *should* allow for proper
arpi
parents: 9001
diff changeset
684 const vector float vczero = (const vector float)FOUROF(0.);
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
685
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
686 bufv_0 = vec_ld((i + 0) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
687 bufv_2 = vec_ld((i + 2) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
688
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
689 cosv = vec_ld(i << 2, xcos1);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
690 sinv = vec_ld(i << 2, xsin1);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
691
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
692 temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
693 temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
694 tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
695 temp1 = vec_madd(temp0022, tempCS01, vczero);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
696 tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
697 temp2 = vec_madd(temp1133, tempCS01, vczero);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
698 bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
699
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
700 vec_st(bufv_0, (i + 0) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
701
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
702 /* idem with bufv_2 and high-order cosv/sinv */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
703
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
704 temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
705 temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
706 tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
707 temp1 = vec_madd(temp0022, tempCS01, vczero);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
708 tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3));
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
709 temp2 = vec_madd(temp1133, tempCS01, vczero);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
710 bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
711
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
712 vec_st(bufv_2, (i + 2) << 3, (float*)buf);
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
713
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
714 #endif
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
715 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
716
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
717 data_ptr = data;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
718 delay_ptr = delay;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
719 window_ptr = imdct_window;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
720
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
721 /* Window and convert to real valued signal */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
722 for(i=0; i< 64; i++) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
723 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
724 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
725 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
726
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
727 for(i=0; i< 64; i++) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
728 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
729 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
730 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
731
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
732 /* The trailing edge of the window goes into the delay line */
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
733 delay_ptr = delay;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
734
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
735 for(i=0; i< 64; i++) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
736 *delay_ptr++ = -buf[64+i].real * *--window_ptr;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
737 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
738 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
739
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
740 for(i=0; i<64; i++) {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
741 *delay_ptr++ = buf[i].imag * *--window_ptr;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
742 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
743 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
744 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
745 #endif
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
746
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
747
4497
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
748 // Stuff below this line is borrowed from libac3
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
749 #include "srfftp.h"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
750 #if defined(ARCH_X86) || defined(ARCH_X86_64)
4497
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
751 #ifndef HAVE_3DNOW
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
752 #define HAVE_3DNOW 1
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
753 #endif
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
754 #include "srfftp_3dnow.h"
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
755
8451
fb88ccbc5ccc compiler warning fixes
arpi
parents: 8254
diff changeset
/*
 * File-scope, 8-byte-aligned constants defined just before imdct_3dnow.h
 * is included.
 * NOTE(review): their use sites are not visible in this part of the file;
 * presumably they are referenced by the 3DNow! code in imdct_3dnow.h -- confirm.
 *
 * x_plus_minus_3dnow / x_minus_plus_3dnow: two-element initializers in which
 * exactly one element is 0x80000000 (only bit 31 set) and the other is zero;
 * the two constants differ only in which element carries that bit.
 * NOTE(review): bit 31 is where the sign bit of a 32-bit IEEE-754 float
 * sits, so these look like XOR masks that flip the sign of one float in a
 * packed pair -- confirm against the assembly that uses them.
 *
 * HSQRT2_3DNOW: both members are 0.707106781188, i.e. sqrt(2)/2.
 */
756 const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }};
fb88ccbc5ccc compiler warning fixes
arpi
parents: 8254
diff changeset
757 const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }};
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
758 const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
759
4497
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
760 #undef HAVE_3DNOWEX
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
761 #include "imdct_3dnow.h"
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
762 #define HAVE_3DNOWEX
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
763 #include "imdct_3dnow.h"
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
764
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
765 void
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
766 imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
767 {
8254
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
768 /* int i,k;
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
769 int p,q;*/
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
770 int m;
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
771 long two_m;
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
772 long two_m_plus_one;
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
773 long two_m_plus_one_shl3;
15617
130dd060f723 one bugfix and a few gcc4 bug workaorunds by (Gianluigi Tiesi: mplayer, netfarm it)
michael
parents: 14991
diff changeset
774 complex_t *buf_offset;
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
775
8254
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
776 /* sample_t tmp_a_i;
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
777 sample_t tmp_a_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
778 sample_t tmp_b_i;
8254
772d6d27fd66 warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents: 4497
diff changeset
779 sample_t tmp_b_r;*/
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
780
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
781 sample_t *data_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
782 sample_t *delay_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
783 sample_t *window_ptr;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
784
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
785 /* 512 IMDCT with source and dest data in 'data' */
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
786 /* see the c version (dct_do_512()), it's almost identical, just in C */
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
787
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
788 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
789 /* Bit reversed shuffling */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
790 asm volatile(
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
791 "xor %%"REG_S", %%"REG_S" \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
792 "lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
793 "mov $1008, %%"REG_D" \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
794 "push %%"REG_BP" \n\t" //use ebp without telling gcc
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
795 ".balign 16 \n\t"
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
796 "1: \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
797 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // XXXI
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
798 "movhps 8(%0, %%"REG_D"), %%xmm0 \n\t" // RXXI
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
799 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // XXXi
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
800 "movhps (%0, %%"REG_D"), %%xmm1 \n\t" // rXXi
3584
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
801 "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
802 "movaps "MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm2\n\t"
3584
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
803 "mulps %%xmm0, %%xmm2 \n\t"
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
804 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
805 "mulps "MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
3584
7c4046c04be3 removing unnecessary sse sin/cos LUT
michael
parents: 3581
diff changeset
806 "subps %%xmm0, %%xmm2 \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
807 "movzb (%%"REG_a"), %%"REG_d" \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
808 "movzb 1(%%"REG_a"), %%"REG_BP" \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
809 "movlps %%xmm2, (%1, %%"REG_d", 8) \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
810 "movhps %%xmm2, (%1, %%"REG_BP", 8) \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
811 "add $16, %%"REG_S" \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
812 "add $2, %%"REG_a" \n\t" // avoid complex addressing for P4 crap
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
813 "sub $16, %%"REG_D" \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
814 "jnc 1b \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
815 "pop %%"REG_BP" \n\t"//no we didnt touch ebp *g*
16189
72764c0dad8a Fixes segfault on IA-32 machines caused by the ASM patch for AMD-64 for a52.
gpoirier
parents: 16173
diff changeset
816 :: "b" (data), "c" (buf)
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
817 : "%"REG_S, "%"REG_D, "%"REG_a, "%"REG_d
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
818 );
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
819
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
820
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
821 /* FFT Merge */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
822 /* unoptimized variant
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
823 for (m=1; m < 7; m++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
824 if(m)
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
825 two_m = (1 << m);
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
826 else
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
827 two_m = 1;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
828
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
829 two_m_plus_one = (1 << (m+1));
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
830
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
831 for(i = 0; i < 128; i += two_m_plus_one) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
832 for(k = 0; k < two_m; k++) {
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
833 p = k + i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
834 q = p + two_m;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
835 tmp_a_r = buf[p].real;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
836 tmp_a_i = buf[p].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
837 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
838 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
839 buf[p].real = tmp_a_r + tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
840 buf[p].imag = tmp_a_i + tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
841 buf[q].real = tmp_a_r - tmp_b_r;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
842 buf[q].imag = tmp_a_i - tmp_b_i;
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
843 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
844 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
845 }
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
846 */
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
847
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
848 /* 1. iteration */
3549
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
849 // Note w[0][0]={1,0}
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
850 asm volatile(
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
851 "xorps %%xmm1, %%xmm1 \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
852 "xorps %%xmm2, %%xmm2 \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
853 "mov %0, %%"REG_S" \n\t"
3529
a86166b495a6 sse opt
michael
parents: 3527
diff changeset
854 ".balign 16 \n\t"
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
855 "1: \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
856 "movlps (%%"REG_S"), %%xmm0\n\t" //buf[p]
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
857 "movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q]
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
858 "movhps (%%"REG_S"), %%xmm0\n\t" //buf[p]
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
859 "movhps 8(%%"REG_S"), %%xmm2\n\t" //buf[q]
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
860 "addps %%xmm1, %%xmm0 \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
861 "subps %%xmm2, %%xmm0 \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
862 "movaps %%xmm0, (%%"REG_S")\n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
863 "add $16, %%"REG_S" \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
864 "cmp %1, %%"REG_S" \n\t"
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
865 " jb 1b \n\t"
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
866 :: "g" (buf), "r" (buf + 128)
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
867 : "%"REG_S
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
868 );
3549
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
869
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
870 /* 2. iteration */
3512
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
871 // Note w[1]={{1,0}, {0,-1}}
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
872 asm volatile(
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
873 "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
874 "mov %0, %%"REG_S" \n\t"
3529
a86166b495a6 sse opt
michael
parents: 3527
diff changeset
875 ".balign 16 \n\t"
3512
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
876 "1: \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
877 "movaps 16(%%"REG_S"), %%xmm2 \n\t" //r2,i2,r3,i3
3512
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
878 "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
879 "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
880 "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
881 "movaps (%%"REG_S"), %%xmm1 \n\t" //r0,i0,r1,i1
3512
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
882 "addps %%xmm2, %%xmm0 \n\t"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
883 "subps %%xmm2, %%xmm1 \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
884 "movaps %%xmm0, (%%"REG_S") \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
885 "movaps %%xmm1, 16(%%"REG_S") \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
886 "add $32, %%"REG_S" \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
887 "cmp %1, %%"REG_S" \n\t"
3512
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
888 " jb 1b \n\t"
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
889 :: "g" (buf), "r" (buf + 128)
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
890 : "%"REG_S
3512
1f166e420b15 a bit more SSE optimizations
michael
parents: 3508
diff changeset
891 );
3549
2e21accd86a8 cleanup
michael
parents: 3546
diff changeset
892
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
893 /* 3. iteration */
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
894 /*
3483390a902b sse opt
michael
parents: 3529
diff changeset
895 Note sseW2+0={1,1,sqrt(2),sqrt(2)}
3483390a902b sse opt
michael
parents: 3529
diff changeset
896 Note sseW2+16={0,0,sqrt(2),-sqrt(2)}
3483390a902b sse opt
michael
parents: 3529
diff changeset
897 Note sseW2+32={0,0,-sqrt(2),-sqrt(2)}
3483390a902b sse opt
michael
parents: 3529
diff changeset
898 Note sseW2+48={1,-1,sqrt(2),-sqrt(2)}
3483390a902b sse opt
michael
parents: 3529
diff changeset
899 */
3483390a902b sse opt
michael
parents: 3529
diff changeset
900 asm volatile(
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
901 "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
902 "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
903 "xorps %%xmm5, %%xmm5 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
904 "xorps %%xmm2, %%xmm2 \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
905 "mov %0, %%"REG_S" \n\t"
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
906 ".balign 16 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
907 "1: \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
908 "movaps 32(%%"REG_S"), %%xmm2 \n\t" //r4,i4,r5,i5
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
909 "movaps 48(%%"REG_S"), %%xmm3 \n\t" //r6,i6,r7,i7
4247
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
910 "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5
2dbd637ffe05 mangle for win32 in liba52 (includes dummy mangle.h pointing to the one in main)
atmos4
parents: 3908
diff changeset
911 "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7
3537
d7e5a32643c9 C optimizations
michael
parents: 3534
diff changeset
912 "mulps %%xmm2, %%xmm4 \n\t"
d7e5a32643c9 C optimizations
michael
parents: 3534
diff changeset
913 "mulps %%xmm3, %%xmm5 \n\t"
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
914 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5
3483390a902b sse opt
michael
parents: 3529
diff changeset
915 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7
3537
d7e5a32643c9 C optimizations
michael
parents: 3534
diff changeset
916 "mulps %%xmm6, %%xmm3 \n\t"
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
917 "mulps %%xmm7, %%xmm2 \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
918 "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
919 "movaps 16(%%"REG_S"), %%xmm1 \n\t" //r2,i2,r3,i3
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
920 "addps %%xmm4, %%xmm2 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
921 "addps %%xmm5, %%xmm3 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
922 "movaps %%xmm2, %%xmm4 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
923 "movaps %%xmm3, %%xmm5 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
924 "addps %%xmm0, %%xmm2 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
925 "addps %%xmm1, %%xmm3 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
926 "subps %%xmm4, %%xmm0 \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
927 "subps %%xmm5, %%xmm1 \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
928 "movaps %%xmm2, (%%"REG_S") \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
929 "movaps %%xmm3, 16(%%"REG_S") \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
930 "movaps %%xmm0, 32(%%"REG_S") \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
931 "movaps %%xmm1, 48(%%"REG_S") \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
932 "add $64, %%"REG_S" \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
933 "cmp %1, %%"REG_S" \n\t"
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
934 " jb 1b \n\t"
3483390a902b sse opt
michael
parents: 3529
diff changeset
935 :: "g" (buf), "r" (buf + 128)
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
936 : "%"REG_S
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
937 );
3508
b5220cf63fc3 some SSE optimizations
michael
parents: 3394
diff changeset
938
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
939 /* 4-7. iterations */
3546
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
940 for (m=3; m < 7; m++) {
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
941 two_m = (1 << m);
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
942 two_m_plus_one = two_m<<1;
15617
130dd060f723 one bugfix and a few gcc4 bug workaorunds by (Gianluigi Tiesi: mplayer, netfarm it)
michael
parents: 14991
diff changeset
943 two_m_plus_one_shl3 = (two_m_plus_one<<3);
130dd060f723 one bugfix and a few gcc4 bug workaorunds by (Gianluigi Tiesi: mplayer, netfarm it)
michael
parents: 14991
diff changeset
944 buf_offset = buf+128;
3546
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
945 asm volatile(
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
946 "mov %0, %%"REG_S" \n\t"
3546
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
947 ".balign 16 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
948 "1: \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
949 "xor %%"REG_D", %%"REG_D" \n\t" // k
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
950 "lea (%%"REG_S", %3), %%"REG_d" \n\t"
3546
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
951 "2: \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
952 "movaps (%%"REG_d", %%"REG_D"), %%xmm1 \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
953 "movaps (%4, %%"REG_D", 2), %%xmm2 \n\t"
3546
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
954 "mulps %%xmm1, %%xmm2 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
955 "shufps $0xB1, %%xmm1, %%xmm1 \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
956 "mulps 16(%4, %%"REG_D", 2), %%xmm1 \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
957 "movaps (%%"REG_S", %%"REG_D"), %%xmm0 \n\t"
3546
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
958 "addps %%xmm2, %%xmm1 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
959 "movaps %%xmm1, %%xmm2 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
960 "addps %%xmm0, %%xmm1 \n\t"
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
961 "subps %%xmm2, %%xmm0 \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
962 "movaps %%xmm1, (%%"REG_S", %%"REG_D") \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
963 "movaps %%xmm0, (%%"REG_d", %%"REG_D") \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
964 "add $16, %%"REG_D" \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
965 "cmp %3, %%"REG_D" \n\t" //FIXME (opt) count against 0
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
966 "jb 2b \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
967 "add %2, %%"REG_S" \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
968 "cmp %1, %%"REG_S" \n\t"
3546
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
969 " jb 1b \n\t"
15617
130dd060f723 one bugfix and a few gcc4 bug workaorunds by (Gianluigi Tiesi: mplayer, netfarm it)
michael
parents: 14991
diff changeset
970 :: "g" (buf), "m" (buf_offset), "m" (two_m_plus_one_shl3), "r" (two_m<<3),
3546
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
971 "r" (sseW[m])
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
972 : "%"REG_S, "%"REG_D, "%"REG_d
3546
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
973 );
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
974 }
4e772a3c6b62 sse opt
michael
parents: 3537
diff changeset
975
3623
3f1c2c06d0d8 adding some comments
michael
parents: 3584
diff changeset
976 /* Post IFFT complex multiply plus IFFT complex conjugate*/
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
977 asm volatile(
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
978 "mov $-1024, %%"REG_S" \n\t"
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
979 ".balign 16 \n\t"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
980 "1: \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
981 "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
982 "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
983 "shufps $0xB1, %%xmm0, %%xmm0 \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
984 "mulps 1024+"MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm1\n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
985 "mulps 1024+"MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
986 "addps %%xmm1, %%xmm0 \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
987 "movaps %%xmm0, (%0, %%"REG_S") \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
988 "add $16, %%"REG_S" \n\t"
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
989 " jnz 1b \n\t"
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
990 :: "r" (buf+128)
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
991 : "%"REG_S
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
992 );
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
993
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
994
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
995 data_ptr = data;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
996 delay_ptr = delay;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
997 window_ptr = imdct_window;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
998
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
999 /* Window and convert to real valued signal */
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1000 asm volatile(
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1001 "xor %%"REG_D", %%"REG_D" \n\t" // 0
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1002 "xor %%"REG_S", %%"REG_S" \n\t" // 0
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1003 "movss %3, %%xmm2 \n\t" // bias
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1004 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1005 ".balign 16 \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1006 "1: \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1007 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ?
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1008 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ?
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1009 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ?
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1010 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ?
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1011 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1012 "mulps "MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1013 "addps (%2, %%"REG_S"), %%xmm0 \n\t"
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1014 "addps %%xmm2, %%xmm0 \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1015 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1016 "add $16, %%"REG_S" \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1017 "sub $16, %%"REG_D" \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1018 "cmp $512, %%"REG_S" \n\t"
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1019 " jb 1b \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1020 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1021 : "%"REG_S, "%"REG_D
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1022 );
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1023 data_ptr+=128;
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1024 delay_ptr+=128;
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1025 // window_ptr+=128;
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
1026
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1027 asm volatile(
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1028 "mov $1024, %%"REG_D" \n\t" // 512
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1029 "xor %%"REG_S", %%"REG_S" \n\t" // 0
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1030 "movss %3, %%xmm2 \n\t" // bias
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1031 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1032 ".balign 16 \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1033 "1: \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1034 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1035 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1036 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1037 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1038 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1039 "mulps 512+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1040 "addps (%2, %%"REG_S"), %%xmm0 \n\t"
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1041 "addps %%xmm2, %%xmm0 \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1042 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1043 "add $16, %%"REG_S" \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1044 "sub $16, %%"REG_D" \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1045 "cmp $512, %%"REG_S" \n\t"
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1046 " jb 1b \n\t"
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1047 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1048 : "%"REG_S, "%"REG_D
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1049 );
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1050 data_ptr+=128;
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1051 // window_ptr+=128;
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1052
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1053 /* The trailing edge of the window goes into the delay line */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1054 delay_ptr = delay;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1055
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1056 asm volatile(
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1057 "xor %%"REG_D", %%"REG_D" \n\t" // 0
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1058 "xor %%"REG_S", %%"REG_S" \n\t" // 0
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1059 ".balign 16 \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1060 "1: \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1061 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1062 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1063 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1064 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1065 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1066 "mulps 1024+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1067 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1068 "add $16, %%"REG_S" \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1069 "sub $16, %%"REG_D" \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1070 "cmp $512, %%"REG_S" \n\t"
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1071 " jb 1b \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1072 :: "r" (buf+64), "r" (delay_ptr)
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1073 : "%"REG_S, "%"REG_D
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1074 );
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1075 delay_ptr+=128;
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1076 // window_ptr-=128;
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
1077
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1078 asm volatile(
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1079 "mov $1024, %%"REG_D" \n\t" // 1024
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1080 "xor %%"REG_S", %%"REG_S" \n\t" // 0
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1081 ".balign 16 \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1082 "1: \n\t"
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1083 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ?
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1084 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ?
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1085 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ?
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1086 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ?
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1087 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1088 "mulps 1536+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1089 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1090 "add $16, %%"REG_S" \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1091 "sub $16, %%"REG_D" \n\t"
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1092 "cmp $512, %%"REG_S" \n\t"
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1093 " jb 1b \n\t"
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1094 :: "r" (buf), "r" (delay_ptr)
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1095 : "%"REG_S, "%"REG_D
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1096 );
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1097 }
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1098 #endif // ARCH_X86 || ARCH_X86_64
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1099
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1100 void
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1101 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias)
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1102 {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1103 int i,k;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1104 int p,q;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1105 int m;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1106 int two_m;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1107 int two_m_plus_one;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1108
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1109 sample_t tmp_a_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1110 sample_t tmp_a_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1111 sample_t tmp_b_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1112 sample_t tmp_b_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1113
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1114 sample_t *data_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1115 sample_t *delay_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1116 sample_t *window_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1117
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1118 complex_t *buf_1, *buf_2;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1119
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1120 buf_1 = &buf[0];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1121 buf_2 = &buf[64];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1122
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1123 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1124 for(k=0; k<64; k++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1125 /* X1[k] = X[2*k] */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1126 /* X2[k] = X[2*k+1] */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1127
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1128 p = 2 * (128-2*k-1);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1129 q = 2 * (2 * k);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1130
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1131 /* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1132 buf_1[k].real = data[p] * xcos2[k] - data[q] * xsin2[k];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1133 buf_1[k].imag = -1.0f * (data[q] * xcos2[k] + data[p] * xsin2[k]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1134 /* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1135 buf_2[k].real = data[p + 1] * xcos2[k] - data[q + 1] * xsin2[k];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1136 buf_2[k].imag = -1.0f * ( data[q + 1] * xcos2[k] + data[p + 1] * xsin2[k]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1137 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1138
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1139 /* IFFT Bit reversed shuffling */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1140 for(i=0; i<64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1141 k = bit_reverse_256[i];
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1142 if (k < i) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1143 swap_cmplx(&buf_1[i],&buf_1[k]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1144 swap_cmplx(&buf_2[i],&buf_2[k]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1145 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1146 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1147
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1148 /* FFT Merge */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1149 for (m=0; m < 6; m++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1150 two_m = (1 << m);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1151 two_m_plus_one = (1 << (m+1));
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1152
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1153 /* FIXME */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1154 if(m)
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1155 two_m = (1 << m);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1156 else
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1157 two_m = 1;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1158
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1159 for(k = 0; k < two_m; k++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1160 for(i = 0; i < 64; i += two_m_plus_one) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1161 p = k + i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1162 q = p + two_m;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1163 /* Do block 1 */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1164 tmp_a_r = buf_1[p].real;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1165 tmp_a_i = buf_1[p].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1166 tmp_b_r = buf_1[q].real * w[m][k].real - buf_1[q].imag * w[m][k].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1167 tmp_b_i = buf_1[q].imag * w[m][k].real + buf_1[q].real * w[m][k].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1168 buf_1[p].real = tmp_a_r + tmp_b_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1169 buf_1[p].imag = tmp_a_i + tmp_b_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1170 buf_1[q].real = tmp_a_r - tmp_b_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1171 buf_1[q].imag = tmp_a_i - tmp_b_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1172
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1173 /* Do block 2 */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1174 tmp_a_r = buf_2[p].real;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1175 tmp_a_i = buf_2[p].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1176 tmp_b_r = buf_2[q].real * w[m][k].real - buf_2[q].imag * w[m][k].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1177 tmp_b_i = buf_2[q].imag * w[m][k].real + buf_2[q].real * w[m][k].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1178 buf_2[p].real = tmp_a_r + tmp_b_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1179 buf_2[p].imag = tmp_a_i + tmp_b_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1180 buf_2[q].real = tmp_a_r - tmp_b_r;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1181 buf_2[q].imag = tmp_a_i - tmp_b_i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1182 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1183 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1184 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1185
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1186 /* Post IFFT complex multiply */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1187 for( i=0; i < 64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1188 /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1189 tmp_a_r = buf_1[i].real;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1190 tmp_a_i = -buf_1[i].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1191 buf_1[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1192 buf_1[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1193 /* y2[n] = z2[n] * (xcos2[n] + j * xsin2[n]) ; */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1194 tmp_a_r = buf_2[i].real;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1195 tmp_a_i = -buf_2[i].imag;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1196 buf_2[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1197 buf_2[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1198 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1199
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1200 data_ptr = data;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1201 delay_ptr = delay;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1202 window_ptr = imdct_window;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1203
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1204 /* Window and convert to real valued signal */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1205 for(i=0; i< 64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1206 *data_ptr++ = -buf_1[i].imag * *window_ptr++ + *delay_ptr++ + bias;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1207 *data_ptr++ = buf_1[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1208 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1209
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1210 for(i=0; i< 64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1211 *data_ptr++ = -buf_1[i].real * *window_ptr++ + *delay_ptr++ + bias;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1212 *data_ptr++ = buf_1[64-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1213 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1214
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1215 delay_ptr = delay;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1216
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1217 for(i=0; i< 64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1218 *delay_ptr++ = -buf_2[i].real * *--window_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1219 *delay_ptr++ = buf_2[64-i-1].imag * *--window_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1220 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1221
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1222 for(i=0; i< 64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1223 *delay_ptr++ = buf_2[i].imag * *--window_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1224 *delay_ptr++ = -buf_2[64-i-1].real * *--window_ptr;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1225 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1226 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1227
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1228 void imdct_init (uint32_t mm_accel)
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1229 {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1230 #ifdef LIBA52_MLIB
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1231 if (mm_accel & MM_ACCEL_MLIB) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1232 fprintf (stderr, "Using mlib for IMDCT transform\n");
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1233 imdct_512 = imdct_do_512_mlib;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1234 imdct_256 = imdct_do_256_mlib;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1235 } else
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1236 #endif
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1237 {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1238 int i, j, k;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1239
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1240 /* Twiddle factors to turn IFFT into IMDCT */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1241 for (i = 0; i < 128; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1242 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1243 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1244 }
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1245 #if defined(ARCH_X86) || defined(ARCH_X86_64)
3527
5a88b21cfe8a sse opt
michael
parents: 3512
diff changeset
1246 for (i = 0; i < 128; i++) {
3581
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
1247 sseSinCos1c[2*i+0]= xcos1[i];
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
1248 sseSinCos1c[2*i+1]= -xcos1[i];
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
1249 sseSinCos1d[2*i+0]= xsin1[i];
8ddf654c4871 sse opt
michael
parents: 3579
diff changeset
1250 sseSinCos1d[2*i+1]= xsin1[i];
3527
5a88b21cfe8a sse opt
michael
parents: 3512
diff changeset
1251 }
5a88b21cfe8a sse opt
michael
parents: 3512
diff changeset
1252 #endif
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1253
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1254 /* More twiddle factors to turn IFFT into IMDCT */
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1255 for (i = 0; i < 64; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1256 xcos2[i] = -cos ((M_PI / 1024) * (8 * i + 1));
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1257 xsin2[i] = -sin ((M_PI / 1024) * (8 * i + 1));
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1258 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1259
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1260 for (i = 0; i < 7; i++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1261 j = 1 << i;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1262 for (k = 0; k < j; k++) {
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1263 w[i][k].real = cos (-M_PI * k / j);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1264 w[i][k].imag = sin (-M_PI * k / j);
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1265 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1266 }
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1267 #if defined(ARCH_X86) || defined(ARCH_X86_64)
3534
3483390a902b sse opt
michael
parents: 3529
diff changeset
1268 for (i = 1; i < 7; i++) {
3483390a902b sse opt
michael
parents: 3529
diff changeset
1269 j = 1 << i;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1270 for (k = 0; k < j; k+=2) {
3483390a902b sse opt
michael
parents: 3529
diff changeset
1271
3483390a902b sse opt
michael
parents: 3529
diff changeset
1272 sseW[i][4*k + 0] = w[i][k+0].real;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1273 sseW[i][4*k + 1] = w[i][k+0].real;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1274 sseW[i][4*k + 2] = w[i][k+1].real;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1275 sseW[i][4*k + 3] = w[i][k+1].real;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1276
3483390a902b sse opt
michael
parents: 3529
diff changeset
1277 sseW[i][4*k + 4] = -w[i][k+0].imag;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1278 sseW[i][4*k + 5] = w[i][k+0].imag;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1279 sseW[i][4*k + 6] = -w[i][k+1].imag;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1280 sseW[i][4*k + 7] = w[i][k+1].imag;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1281
3483390a902b sse opt
michael
parents: 3529
diff changeset
1282 //we multiply more or less uninitalized numbers so we need to use exactly 0.0
3483390a902b sse opt
michael
parents: 3529
diff changeset
1283 if(k==0)
3483390a902b sse opt
michael
parents: 3529
diff changeset
1284 {
3483390a902b sse opt
michael
parents: 3529
diff changeset
1285 // sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1286 sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1287 }
3483390a902b sse opt
michael
parents: 3529
diff changeset
1288
3483390a902b sse opt
michael
parents: 3529
diff changeset
1289 if(2*k == j)
3483390a902b sse opt
michael
parents: 3529
diff changeset
1290 {
3483390a902b sse opt
michael
parents: 3529
diff changeset
1291 sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0;
3483390a902b sse opt
michael
parents: 3529
diff changeset
1292 // sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0);
3483390a902b sse opt
michael
parents: 3529
diff changeset
1293 }
3483390a902b sse opt
michael
parents: 3529
diff changeset
1294 }
3483390a902b sse opt
michael
parents: 3529
diff changeset
1295 }
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1296
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1297 for(i=0; i<128; i++)
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1298 {
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1299 sseWindow[2*i+0]= -imdct_window[2*i+0];
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1300 sseWindow[2*i+1]= imdct_window[2*i+1];
3552
9ff2e3801027 sse opt
michael
parents: 3549
diff changeset
1301 }
3553
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1302
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1303 for(i=0; i<64; i++)
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1304 {
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1305 sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1];
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1306 sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0];
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1307 sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1];
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1308 sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0];
a501627fc6db sse opt
michael
parents: 3552
diff changeset
1309 }
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1310 #endif // ARCH_X86 || ARCH_X86_64
3579
831860fada69 runtime cpu detection for the idct
michael
parents: 3553
diff changeset
1311
3720
120ac80f13c2 Fixed #ifdef discrepancy that was breaking compilation on PPC platform
melanson
parents: 3623
diff changeset
1312 imdct_512 = imdct_do_512;
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1313 #if defined(ARCH_X86) || defined(ARCH_X86_64)
4497
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1314 if(mm_accel & MM_ACCEL_X86_SSE)
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1315 {
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1316 fprintf (stderr, "Using SSE optimized IMDCT transform\n");
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1317 imdct_512 = imdct_do_512_sse;
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1318 }
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1319 else
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1320 if(mm_accel & MM_ACCEL_X86_3DNOWEXT)
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1321 {
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1322 fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n");
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1323 imdct_512 = imdct_do_512_3dnowex;
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1324 }
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1325 else
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1326 if(mm_accel & MM_ACCEL_X86_3DNOW)
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1327 {
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1328 fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1329 imdct_512 = imdct_do_512_3dnow;
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1330 }
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1331 else
16173
d6219ce521e9 liba52 asm optimizations ported to amd64
aurel
parents: 15617
diff changeset
1332 #endif // ARCH_X86 || ARCH_X86_64
9001
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
1333 #ifdef HAVE_ALTIVEC
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
1334 if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
1335 {
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
1336 fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
1337 imdct_512 = imdct_do_512_altivec;
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
1338 }
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
1339 else
01a9cf43074c An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)
arpi
parents: 8451
diff changeset
1340 #endif
4497
d3aedd7db02c Restore K7 support
nick
parents: 4247
diff changeset
1341 fprintf (stderr, "No accelerated IMDCT transform found\n");
3394
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1342 imdct_256 = imdct_do_256;
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1343 }
35b18ed357c2 imported from liba52 CVS
arpi
parents:
diff changeset
1344 }
3884
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1345
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1346 static void fft_asmb(int k, complex_t *x, complex_t *wTB,
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1347 const complex_t *d, const complex_t *d_3)
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1348 {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1349 register complex_t *x2k, *x3k, *x4k, *wB;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1350 register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1351
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1352 x2k = x + 2 * k;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1353 x3k = x2k + 2 * k;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1354 x4k = x3k + 2 * k;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1355 wB = wTB + 2 * k;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1356
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1357 TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1358 TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1359
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1360 --k;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1361 for(;;) {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1362 TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1363 TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1364 if (!--k) break;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1365 x += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1366 x2k += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1367 x3k += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1368 x4k += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1369 d += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1370 d_3 += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1371 wTB += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1372 wB += 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1373 }
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1374
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1375 }
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1376
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1377 static void fft_asmb16(complex_t *x, complex_t *wTB)
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1378 {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1379 register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1380 int k = 2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1381
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1382 /* transform x[0], x[8], x[4], x[12] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1383 TRANSZERO(x[0],x[4],x[8],x[12]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1384
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1385 /* transform x[1], x[9], x[5], x[13] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1386 TRANS(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1387
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1388 /* transform x[2], x[10], x[6], x[14] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1389 TRANSHALF_16(x[2],x[6],x[10],x[14]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1390
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1391 /* transform x[3], x[11], x[7], x[15] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1392 TRANS(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1393
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1394 }
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1395
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1396 static void fft_4(complex_t *x)
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1397 {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1398 /* delta_p = 1 here */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1399 /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4}
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1400 */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1401
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1402 register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1403
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1404 yt_r = x[0].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1405 yb_r = yt_r - x[2].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1406 yt_r += x[2].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1407
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1408 u_r = x[1].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1409 vi_i = x[3].real - u_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1410 u_r += x[3].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1411
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1412 u_i = x[1].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1413 vi_r = u_i - x[3].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1414 u_i += x[3].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1415
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1416 yt_i = yt_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1417 yt_i += u_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1418 x[0].real = yt_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1419 yt_r -= u_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1420 x[2].real = yt_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1421 yt_i = yb_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1422 yt_i += vi_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1423 x[1].real = yt_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1424 yb_r -= vi_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1425 x[3].real = yb_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1426
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1427 yt_i = x[0].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1428 yb_i = yt_i - x[2].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1429 yt_i += x[2].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1430
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1431 yt_r = yt_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1432 yt_r += u_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1433 x[0].imag = yt_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1434 yt_i -= u_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1435 x[2].imag = yt_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1436 yt_r = yb_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1437 yt_r += vi_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1438 x[1].imag = yt_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1439 yb_i -= vi_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1440 x[3].imag = yb_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1441 }
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1442
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1443
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1444 static void fft_8(complex_t *x)
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1445 {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1446 /* delta_p = diag{1, sqrt(i)} here */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1447 /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8}
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1448 */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1449 register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1450
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1451 wT1_r = x[1].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1452 wT1_i = x[1].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1453 wB1_r = x[3].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1454 wB1_i = x[3].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1455
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1456 x[1] = x[2];
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1457 x[2] = x[4];
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1458 x[3] = x[6];
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1459 fft_4(&x[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1460
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1461
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1462 /* x[0] x[4] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1463 wT2_r = x[5].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1464 wT2_r += x[7].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1465 wT2_r += wT1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1466 wT2_r += wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1467 wT2_i = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1468 wT2_r += x[0].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1469 wT2_i = x[0].real - wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1470 x[0].real = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1471 x[4].real = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1472
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1473 wT2_i = x[5].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1474 wT2_i += x[7].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1475 wT2_i += wT1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1476 wT2_i += wB1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1477 wT2_r = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1478 wT2_r += x[0].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1479 wT2_i = x[0].imag - wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1480 x[0].imag = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1481 x[4].imag = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1482
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1483 /* x[2] x[6] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1484 wT2_r = x[5].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1485 wT2_r -= x[7].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1486 wT2_r += wT1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1487 wT2_r -= wB1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1488 wT2_i = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1489 wT2_r += x[2].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1490 wT2_i = x[2].real - wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1491 x[2].real = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1492 x[6].real = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1493
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1494 wT2_i = x[5].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1495 wT2_i -= x[7].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1496 wT2_i += wT1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1497 wT2_i -= wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1498 wT2_r = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1499 wT2_r += x[2].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1500 wT2_i = x[2].imag - wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1501 x[2].imag = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1502 x[6].imag = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1503
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1504
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1505 /* x[1] x[5] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1506 wT2_r = wT1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1507 wT2_r += wB1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1508 wT2_r -= x[5].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1509 wT2_r -= x[7].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1510 wT2_i = wT1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1511 wT2_i -= wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1512 wT2_i -= x[5].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1513 wT2_i += x[7].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1514
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1515 wB2_r = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1516 wB2_r += wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1517 wT2_i -= wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1518 wB2_r *= HSQRT2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1519 wT2_i *= HSQRT2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1520 wT2_r = wB2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1521 wB2_r += x[1].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1522 wT2_r = x[1].real - wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1523
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1524 wB2_i = x[5].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1525 x[1].real = wB2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1526 x[5].real = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1527
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1528 wT2_r = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1529 wT2_r += x[1].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1530 wT2_i = x[1].imag - wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1531 wB2_r = x[5].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1532 x[1].imag = wT2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1533 x[5].imag = wT2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1534
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1535 /* x[3] x[7] */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1536 wT1_r -= wB1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1537 wT1_i += wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1538 wB1_r = wB2_i - x[7].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1539 wB1_i = wB2_r + x[7].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1540 wT1_r -= wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1541 wT1_i -= wB1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1542 wB1_r = wT1_r + wT1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1543 wB1_r *= HSQRT2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1544 wT1_i -= wT1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1545 wT1_i *= HSQRT2;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1546 wB2_r = x[3].real;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1547 wB2_i = wB2_r + wT1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1548 wB2_r -= wT1_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1549 x[3].real = wB2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1550 x[7].real = wB2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1551 wB2_i = x[3].imag;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1552 wB2_r = wB2_i + wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1553 wB2_i -= wB1_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1554 x[3].imag = wB2_i;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1555 x[7].imag = wB2_r;
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1556 }
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1557
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1558
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1559 static void fft_128p(complex_t *a)
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1560 {
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1561 fft_8(&a[0]); fft_4(&a[8]); fft_4(&a[12]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1562 fft_asmb16(&a[0], &a[8]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1563
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1564 fft_8(&a[16]), fft_8(&a[24]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1565 fft_asmb(4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1566
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1567 fft_8(&a[32]); fft_4(&a[40]); fft_4(&a[44]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1568 fft_asmb16(&a[32], &a[40]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1569
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1570 fft_8(&a[48]); fft_4(&a[56]); fft_4(&a[60]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1571 fft_asmb16(&a[48], &a[56]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1572
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1573 fft_asmb(8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1574
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1575 fft_8(&a[64]); fft_4(&a[72]); fft_4(&a[76]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1576 /* fft_16(&a[64]); */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1577 fft_asmb16(&a[64], &a[72]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1578
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1579 fft_8(&a[80]); fft_8(&a[88]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1580
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1581 /* fft_32(&a[64]); */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1582 fft_asmb(4, &a[64], &a[80],&delta32[0], &delta32_3[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1583
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1584 fft_8(&a[96]); fft_4(&a[104]), fft_4(&a[108]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1585 /* fft_16(&a[96]); */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1586 fft_asmb16(&a[96], &a[104]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1587
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1588 fft_8(&a[112]), fft_8(&a[120]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1589 /* fft_32(&a[96]); */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1590 fft_asmb(4, &a[96], &a[112], &delta32[0], &delta32_3[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1591
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1592 /* fft_128(&a[0]); */
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1593 fft_asmb(16, &a[0], &a[64], &delta128[0], &delta128_3[0]);
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1594 }
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1595
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1596
0410677eda4a ported 3dnow(ex) optimizations from libac3
michael
parents: 3720
diff changeset
1597