Mercurial > mplayer.hg
annotate liba52/srfftp_3dnow.h @ 956:a6cecd9a1bad
'-ao' switch (including '-ao help'), fixing Arpi's bug (short name 'null' for both of oss and null driver ;)
author | lgb |
---|---|
date | Sun, 03 Jun 2001 00:24:49 +0000 |
parents | d44a690543ac |
children | 970fbd433564 |
rev | line source |
---|---|
885 | 1 /* |
2 * srfftp.h | |
3 * | |
4 * Copyright (C) Yuqing Deng <Yuqing_Deng@brown.edu> - April 2000 | |
5 * | |
6 * 64 and 128 point split radix fft for ac3dec | |
7 * | |
8 * The algorithm is described in the book: | |
9 * "Computational Frameworks of the Fast Fourier Transform". | |
10 * | |
11 * The ideas and the organization of the code are borrowed from djbfft, written by | |
12 * D. J. Bernstein <djb@cr.yp.to>. djbfft can be found at | |
13 * http://cr.yp.to/djbfft.html. | |
14 * | |
15 * srfftp.h is free software; you can redistribute it and/or modify | |
16 * it under the terms of the GNU General Public License as published by | |
17 * the Free Software Foundation; either version 2, or (at your option) | |
18 * any later version. | |
19 * | |
20 * srfftp.h is distributed in the hope that it will be useful, | |
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
23 * GNU General Public License for more details. | |
24 * | |
25 * You should have received a copy of the GNU General Public License | |
26 * along with GNU Make; see the file COPYING. If not, write to | |
27 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | |
28 * | |
29 * Modified for using AMD's 3DNow! - 3DNowEx(DSP)! SIMD operations | |
30 * by Nick Kurshev <nickols_k@mail.ru> | |
31 */ | |
32 | |
33 #ifndef SRFFTP_3DNOW_H__ | |
34 #define SRFFTP_3DNOW_H__ | |
35 | |
/*
 * Packed constant holding 0.707106781188 (approximately 1/sqrt(2)) in both
 * members. TRANSHALF_16_3DNOW loads it with movq as a memory operand and
 * multiplies u and v by it. Declared with 8-byte alignment.
 * NOTE(review): complex_t is declared elsewhere; this presumably relies on it
 * being two adjacent 32-bit floats -- confirm against its definition.
 */
890 | 36 static complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 }; |
886 | 37 |
/*
 * TRANS_FILL_MM6_MM7_3DNOW(): loads the two sign masks that the
 * TRANS*_3DNOW macros multiply by (pfmul) to negate one half of a packed
 * pair. Using the "low | high" notation of the inline comments:
 *   mm7 = -1.0 |  1.0
 *   mm6 =  1.0 | -1.0
 * The integers -1 and 1 are built in eax, packed with punpckldq and
 * converted to float with pi2fd. The HAVE_3DNOWEX variant derives mm6 from
 * mm7 with pswapd; the plain 3DNow! variant packs and converts mm6
 * separately.
 * Only eax and memory are listed as clobbers: mm6/mm7 are neither outputs
 * nor clobbers, so nothing tells the compiler they are live. The macros
 * below read them and expect the values to still be there.
 * The expansion already ends with ';'.
 */
885 | 38 #ifdef HAVE_3DNOWEX |
39 #define TRANS_FILL_MM6_MM7_3DNOW()\ | |
890 | 40 __asm__ __volatile__(\ |
885 | 41 "movl $-1, %%eax\n\t"\ |
42 "movd %%eax, %%mm7\n\t"\ | |
43 "negl %%eax\n\t"\ | |
44 "movd %%eax, %%mm6\n\t"\ | |
45 "punpckldq %%mm6, %%mm7\n\t" /* -1.0 | 1.0 */\ | |
46 "pi2fd %%mm7, %%mm7\n\t"\ | |
47 "pswapd %%mm7, %%mm6\n\t"/* 1.0 | -1.0 */\ | |
48 :::"eax","memory"); | |
49 #else | |
50 #define TRANS_FILL_MM6_MM7_3DNOW()\ | |
890 | 51 __asm__ __volatile__(\ |
885 | 52 "movl $-1, %%eax\n\t"\ |
53 "movd %%eax, %%mm7\n\t"\ | |
54 "negl %%eax\n\t"\ | |
55 "movd %%eax, %%mm6\n\t"\ | |
56 "punpckldq %%mm6, %%mm7\n\t" /* -1.0 | 1.0 */\ | |
57 "punpckldq %%mm7, %%mm6\n\t" /* 1.0 | -1.0 */\ | |
58 "pi2fd %%mm7, %%mm7\n\t"\ | |
59 "pi2fd %%mm6, %%mm6\n\t"\ | |
60 :::"eax","memory"); | |
61 #endif | |
62 | |
/*
 * PSWAP_MM(mm_base, mm_hlp): expands to an inline-asm string fragment that
 * swaps the low and high 32-bit halves of the MMX register named by mm_base.
 *
 *   mm_base  string literal naming the register to swap, e.g. "%%mm4"
 *   mm_hlp   string literal naming a scratch register; it is overwritten by
 *            the plain 3DNow! fallback and ignored when HAVE_3DNOWEX is set
 *
 * With HAVE_3DNOWEX the single pswapd instruction is used. Otherwise the
 * swap is emulated: copy base to hlp, shift base right by 32 so its old high
 * half becomes the low half, then punpckldq puts hlp's low half (the old low
 * half of base) into base's high half.
 *
 * The pieces are joined by ordinary adjacent string-literal concatenation.
 * The previous form used the ## operator between string literals, which does
 * not produce a valid preprocessing token and is rejected by current
 * compilers.
 */
#ifdef HAVE_3DNOWEX
#define PSWAP_MM(mm_base,mm_hlp) "pswapd " mm_base "," mm_base "\n\t"
#else
#define PSWAP_MM(mm_base,mm_hlp)\
	"movq " mm_base "," mm_hlp "\n\t"\
	"psrlq $32, " mm_base "\n\t"\
	"punpckldq " mm_hlp "," mm_base "\n\t"
#endif
/*
 * PFNACC_MM(mm_base, mm_hlp): expands to an inline-asm string fragment that
 * leaves (low half - high half) of the MMX register mm_base in its low
 * 32 bits.
 *
 *   mm_base  string literal naming the register operated on, e.g. "%%mm2"
 *   mm_hlp   string literal naming a scratch register; it is overwritten by
 *            the plain 3DNow! fallback and ignored when HAVE_3DNOWEX is set
 *
 * With HAVE_3DNOWEX this is "pfnacc base,base". The fallback copies base to
 * hlp, broadcasts the high half of hlp into both halves, and subtracts it
 * from base with pfsub. Only the LOW half of the result matches pfnacc: the
 * fallback's high half is (high - high) rather than (low - high), so callers
 * must consume the low half only.
 *
 * The pieces are joined by ordinary adjacent string-literal concatenation.
 * The previous form used the ## operator between string literals, which does
 * not produce a valid preprocessing token and is rejected by current
 * compilers.
 */
#ifdef HAVE_3DNOWEX
#define PFNACC_MM(mm_base,mm_hlp) "pfnacc " mm_base "," mm_base "\n\t"
#else
#define PFNACC_MM(mm_base,mm_hlp)\
	"movq " mm_base "," mm_hlp "\n\t"\
	"psrlq $32," mm_hlp "\n\t"\
	"punpckldq " mm_hlp "," mm_hlp "\n\t"\
	"pfsub " mm_hlp "," mm_base "\n\t"
#endif
885 | 80 |
/*
 * TRANSZERO_3DNOW(A0,A4,A8,A12): butterfly step on four packed pairs.
 * Inline comments use "low | high" notation and treat the low half as .re.
 *
 * With w0 = wTB[0] and wk = wTB[k*2] (wTB and k come from the expansion
 * site, they are not macro parameters):
 *   u = w0 + wk
 *   v = (wk.im - w0.im) | (w0.re - wk.re)
 * and, using the values A0 and A4 had on entry:
 *   A0  = A0 + u      A8  = A0 - u
 *   A12 = A4 + v      A4  = A4 - v
 * A8 and A12 are write-only; A0 and A4 are read and then overwritten.
 *
 * Expects mm6 = 1.0 | -1.0 and mm7 = -1.0 | 1.0 to have been loaded already
 * (TRANS_FILL_MM6_MM7_3DNOW does this). Overwrites mm0-mm5 without listing
 * them as clobbers.
 */
81 #define TRANSZERO_3DNOW(A0,A4,A8,A12) \ | |
82 { \ | |
890 | 83 __asm__ __volatile__(\ |
885 | 84 "movq %4, %%mm0\n\t" /* mm0 = wTB[0]*/\ |
85 "movq %5, %%mm1\n\t" /* mm1 = wTB[k*2]*/ \ | |
86 "movq %%mm0, %%mm5\n\t"/*u.re = wTB[0].re + wTB[k*2].re;*/\ | |
87 "pfadd %%mm1, %%mm5\n\t"/*u.im = wTB[0].im + wTB[k*2].im; mm5 = u*/\ | |
88 "pfmul %%mm6, %%mm0\n\t"/*mm0 = wTB[0].re | -wTB[0].im */\ | |
89 "pfmul %%mm7, %%mm1\n\t"/*mm1 = -wTB[k*2].re | wTB[k*2].im */\ | |
90 "pfadd %%mm1, %%mm0\n\t"/*v.im = wTB[0].re - wTB[k*2].re;*/\ | |
91 "movq %%mm0, %%mm4\n\t"/*v.re =-wTB[0].im + wTB[k*2].im;*/\ | |
92 PSWAP_MM("%%mm4","%%mm2")/* mm4 = v (halves swapped into re | im order)*/\ | |
93 "movq %6, %%mm0\n\t" /* a1 = A0;*/\ | |
886 | 94 "movq %7, %%mm2\n\t" /* a1 = A4;*/\ |
885 | 95 "movq %%mm0, %%mm1\n\t"\ |
886 | 96 "movq %%mm2, %%mm3\n\t"\ |
885 | 97 "pfadd %%mm5, %%mm0\n\t" /*A0 = a1 + u;*/\ |
886 | 98 "pfadd %%mm4, %%mm2\n\t" /*A12 = a1 + v;*/\ |
885 | 99 "pfsub %%mm5, %%mm1\n\t" /*A8 = a1 - u;*/\ |
886 | 100 "pfsub %%mm4, %%mm3\n\t" /*A4 = a1 - v;*/\ |
885 | 101 "movq %%mm0, %0\n\t"\ |
886 | 102 "movq %%mm2, %3\n\t"\ |
885 | 103 "movq %%mm1, %1\n\t"\ |
104 "movq %%mm3, %2"\ | |
105 :"=m"(A0), "=m"(A8), "=m"(A4), "=m"(A12)\ | |
106 :"m"(wTB[0]), "m"(wTB[k*2]), "0"(A0), "2"(A4)\ | |
107 :"memory");\ | |
108 } | |
109 | |
/*
 * TRANSHALF_16_3DNOW(A2,A6,A10,A14): butterfly step that reads wTB[2] and
 * wTB[6] (wTB comes from the expansion site) and scales by HSQRT2_3DNOW.
 * Inline comments use "low | high" notation and treat the low half as .re.
 *
 * Intermediate values, in the order the asm computes them:
 *   u = (wTB[2].im + wTB[2].re) | (wTB[2].im - wTB[2].re)
 *   a = (wTB[6].im - wTB[6].re) | (wTB[6].im + wTB[6].re)
 *   v = (u.im + a.im) | (u.re - a.re)
 *   u = (u.re + a.re) | (u.im - a.im)
 *   u *= HSQRT2_3DNOW;  v *= HSQRT2_3DNOW
 * Results, using the values A2 and A6 had on entry:
 *   A2  = A2 + u                    A10 = A2 - u
 *   A6  = A6 + ( v.re | -v.im)      A14 = A6 + (-v.re | v.im)
 *
 * Expects mm6 = 1.0 | -1.0 and mm7 = -1.0 | 1.0 to have been loaded already
 * (TRANS_FILL_MM6_MM7_3DNOW does this). Overwrites mm0-mm5 without listing
 * them as clobbers.
 */
886 | 110 #define TRANSHALF_16_3DNOW(A2,A6,A10,A14)\ |
111 {\ | |
890 | 112 __asm__ __volatile__(\ |
886 | 113 "movq %4, %%mm0\n\t"/*u.re = wTB[2].im + wTB[2].re;*/\ |
114 "movq %%mm0, %%mm1\n\t"\ | |
115 "pfmul %%mm7, %%mm1\n\t"\ | |
116 "pfacc %%mm1, %%mm0\n\t"/*u.im = wTB[2].im - wTB[2].re; mm0 = u*/\ | |
117 "movq %5, %%mm1\n\t" /*a.re = wTB[6].im - wTB[6].re; */\ | |
118 "movq %%mm1, %%mm2\n\t"\ | |
119 "pfmul %%mm7, %%mm1\n\t"\ | |
120 "pfacc %%mm2, %%mm1\n\t"/*a.im = wTB[6].im + wTB[6].re; mm1 = a*/\ | |
121 "movq %%mm1, %%mm2\n\t"\ | |
122 "pfmul %%mm7, %%mm2\n\t"/*v.im = u.re - a.re;*/\ | |
123 "movq %%mm0, %%mm3\n\t"/*v.re = u.im + a.im;*/\ | |
124 "pfadd %%mm2, %%mm3\n\t"\ | |
125 PSWAP_MM("%%mm3","%%mm2")/*mm3 = v*/\ | |
126 "pfmul %%mm6, %%mm1\n\t"/*u.re = u.re + a.re;*/\ | |
127 "pfadd %%mm1, %%mm0\n\t"/*u.im = u.im - a.im; mm0 = u*/\ | |
890 | 128 "movq %8, %%mm2\n\t"\ |
886 | 129 "pfmul %%mm2, %%mm3\n\t" /* v *= HSQRT2_3DNOW; */\ |
130 "pfmul %%mm2, %%mm0\n\t" /* u *= HSQRT2_3DNOW; */\ | |
131 "movq %6, %%mm1\n\t" /* a1 = A2;*/\ | |
926
d44a690543ac
Better insns scheduling and moving out local variables
nickols_k
parents:
920
diff
changeset
|
132 "movq %7, %%mm5\n\t" /* a1 = A6;*/\ |
886 | 133 "movq %%mm1, %%mm2\n\t"\ |
926
d44a690543ac
Better insns scheduling and moving out local variables
nickols_k
parents:
920
diff
changeset
|
134 "movq %%mm3, %%mm4\n\t"\ |
886 | 135 "pfadd %%mm0, %%mm1\n\t" /*A2 = a1 + u;*/\ |
926
d44a690543ac
Better insns scheduling and moving out local variables
nickols_k
parents:
920
diff
changeset
|
136 "pfmul %%mm6, %%mm4\n\t"/* mm4 = v.re | -v.im */\ |
886 | 137 "pfsub %%mm0, %%mm2\n\t" /*A10 = a1 - u;*/\ |
926
d44a690543ac
Better insns scheduling and moving out local variables
nickols_k
parents:
920
diff
changeset
|
138 "pfmul %%mm7, %%mm3\n\t"/* mm3 = -v.re | v.im */\ |
886 | 139 "movq %%mm1, %0\n\t"\ |
140 "movq %%mm2, %1\n\t"\ | |
926
d44a690543ac
Better insns scheduling and moving out local variables
nickols_k
parents:
920
diff
changeset
|
141 "movq %%mm5, %%mm2\n\t"\ |
d44a690543ac
Better insns scheduling and moving out local variables
nickols_k
parents:
920
diff
changeset
|
142 "pfadd %%mm4, %%mm5\n\t"/*A6 = a1 + (v.re | -v.im);*/\ |
886 | 143 "pfadd %%mm3, %%mm2\n\t"/*A14 = a1 + (-v.re | v.im);*/\ |
926
d44a690543ac
Better insns scheduling and moving out local variables
nickols_k
parents:
920
diff
changeset
|
144 "movq %%mm5, %2\n\t"\ |
886 | 145 "movq %%mm2, %3"\ |
146 :"=m"(A2), "=m"(A10), "=m"(A6), "=m"(A14)\ | |
147 :"m"(wTB[2]), "m"(wTB[6]), "0"(A2), "2"(A6), "m"(HSQRT2_3DNOW)\ | |
148 :"memory");\ | |
920 | 149 } |
150 | |
/*
 * TRANS_3DNOW(A1,A5,A9,A13,WT,WB,D,D3): general butterfly step, split into
 * two asm statements. Inline comments use "low | high" notation and treat
 * the low half as .re.
 *
 * First statement (inputs WT, D, WB, D3; no outputs) computes, with
 *   u = WT * D   (complex product)
 *   w = WB * D3  (complex product)
 * the two values
 *   mm4 = u + w
 *   mm5 = (w.im - u.im) | (u.re - w.re)
 * Second statement, using the values A1 and A5 had on entry:
 *   A1 = A1 + mm4      A9  = A1 - mm4
 *   A5 = A5 - mm5      A13 = A5 + mm5
 *
 * mm4 and mm5 carry the result from the first statement to the second but
 * are not declared as operands or clobbers of either, so nothing tells the
 * compiler they are live in between.
 * Expects mm6 = 1.0 | -1.0 and mm7 = -1.0 | 1.0 to have been loaded already
 * (TRANS_FILL_MM6_MM7_3DNOW does this). Overwrites mm0-mm5.
 * Only the low half of the PFNACC_MM result is used (by the following
 * punpckldq).
 */
151 #define TRANS_3DNOW(A1,A5,A9,A13,WT,WB,D,D3)\ | |
152 { \ | |
153 __asm__ __volatile__(\ | |
154 "movq %1, %%mm4\n\t"\ | |
155 "movq %%mm4, %%mm5\n\t"\ | |
156 "punpckldq %%mm4, %%mm4\n\t"/*mm4 = D.re | D.re */\ | |
157 "punpckhdq %%mm5, %%mm5\n\t"/*mm5 = D.im | D.im */\ | |
158 "movq %0, %%mm0\n\t"\ | |
159 "pfmul %%mm0, %%mm4\n\t"/* mm4 = WT.re*D.re | WT.im*D.re */\ | |
160 "pfmul %%mm0, %%mm5\n\t"/* mm5 = WT.re*D.im | WT.im*D.im */\ | |
161 PSWAP_MM("%%mm5","%%mm3")\ | |
162 "pfmul %%mm7, %%mm5\n\t"\ | |
163 "pfadd %%mm5, %%mm4\n\t"/* mm4 = u = WT*D */\ | |
164 "movq %3, %%mm1\n\t"\ | |
165 "movq %2, %%mm0\n\t"\ | |
166 PSWAP_MM("%%mm1","%%mm3")\ | |
167 "movq %%mm0, %%mm2\n\t"\ | |
168 "pfmul %%mm1, %%mm0\n\t"/* mm0 = WB.re*D3.im | WB.im*D3.re */\ | |
169 "pfmul %3, %%mm2\n\t"/* mm2 = WB.re*D3.re | WB.im*D3.im */\ | |
170 PFNACC_MM("%%mm2","%%mm3")\ | |
171 "pfacc %%mm0, %%mm0\n\t"\ | |
926
d44a690543ac
Better insns scheduling and moving out local variables
nickols_k
parents:
920
diff
changeset
|
172 "movq %%mm4, %%mm5\n\t"\ |
920 | 173 "punpckldq %%mm0,%%mm2\n\t"/*mm2 = w.re | w.im, w = WB*D3*/\ |
926
d44a690543ac
Better insns scheduling and moving out local variables
nickols_k
parents:
920
diff
changeset
|
174 "pfmul %%mm6, %%mm5\n\t"\ |
920 | 175 "movq %%mm2, %%mm3\n\t"\ |
176 "pfmul %%mm7, %%mm3\n\t"\ | |
177 "pfadd %%mm3, %%mm5\n\t"\ | |
178 PSWAP_MM("%%mm5","%%mm3")/* mm5 = v = w.im-u.im | u.re-w.re */\ | |
179 "pfadd %%mm2, %%mm4\n\t"\ | |
180 :\ | |
181 :"m"(WT), "m"(D), "m"(WB), "m"(D3)\ | |
182 :"memory");\ | |
183 __asm__ __volatile__(\ | |
184 "movq %4, %%mm0\n\t"/* a1 = A1*/\ | |
926
d44a690543ac
Better insns scheduling and moving out local variables
nickols_k
parents:
920
diff
changeset
|
185 "movq %5, %%mm2\n\t"/* a1 = A5*/\ |
920 | 186 "movq %%mm0, %%mm1\n\t"\ |
926
d44a690543ac
Better insns scheduling and moving out local variables
nickols_k
parents:
920
diff
changeset
|
187 "movq %%mm2, %%mm3\n\t"\ |
920 | 188 "pfadd %%mm4, %%mm0\n\t"/*A1 = a1 + u*/\ |
926
d44a690543ac
Better insns scheduling and moving out local variables
nickols_k
parents:
920
diff
changeset
|
189 "pfsub %%mm5, %%mm2\n\t"/*A5 = a1 - v*/\ |
920 | 190 "pfsub %%mm4, %%mm1\n\t"/*A9 = a1 - u*/\ |
926
d44a690543ac
Better insns scheduling and moving out local variables
nickols_k
parents:
920
diff
changeset
|
191 "pfadd %%mm5, %%mm3\n\t"/*A13 = a1 + v*/\ |
920 | 192 "movq %%mm0, %0\n\t"\ |
193 "movq %%mm1, %1\n\t"\ | |
194 "movq %%mm2, %2\n\t"\ | |
195 "movq %%mm3, %3"\ | |
196 :"=m"(A1), "=m"(A9), "=m"(A5), "=m"(A13)\ | |
926
d44a690543ac
Better insns scheduling and moving out local variables
nickols_k
parents:
920
diff
changeset
|
197 :"0"(A1), "2"(A5)\ |
920 | 198 :"memory");\ |
886 | 199 } |
200 | |
885 | 201 #endif |