Mercurial > mplayer.hg
annotate libao2/fir.h @ 4708:000ca7a19331
mem2agpcpy_pic()
author | michael |
---|---|
date | Thu, 14 Feb 2002 22:34:24 +0000 |
parents | 99dc749591e2 |
children | 534ef9323eca |
rev | line source |
---|---|
3631 | 1 /*============================================================================= |
2 // | |
4049 | 3 // This software has been released under the terms of the GNU Public |
4 // license. See http://www.gnu.org/copyleft/gpl.html for details. | |
3631 | 5 // |
6 // Copyright 2001 Anders Johansson ajh@atri.curtin.edu.au | |
7 // | |
8 //============================================================================= | |
9 */ | |
10 | |
11 #ifndef __FIR_H__ | |
12 #define __FIR_H__ | |
13 | |
14 /* 4, 8 and 16 tap FIR filters implemented using SSE instructions | |
15 int16_t* x Input data | |
16 int16_t* y Output value | |
17 int16_t* w Filter weights | |
18 | |
19 C function | |
20 for(int i = 0 ; i < L ; i++) | |
21 *y += w[i]*x[i]; | |
22 */ | |
23 | |
24 #ifdef HAVE_SSE | |
25 | |
26 // This block should be MMX-only compatible, but it isn't (pinsrw is an SSE instruction)... | |
27 #ifdef L4 | |
// LOAD_QUE(x), 4-tap SSE variant.
// Loads the 8 bytes starting at (x)[0] into MMX register mm2, which the
// 4-tap UPDATE_QUE and FIR macros then use as the sample queue. With x being
// int16_t* (as the header comment states) this is four samples.
// mm2 is not named in any operand or clobber list, so the queue only survives
// between the separate asm statements if the compiler leaves mm2 alone.
28 #define LOAD_QUE(x) \ | |
29 __asm __volatile("movq %0, %%mm2\n\t" \ | |
30 : \ | |
31 :"m"((x)[0]) \ | |
32 :"memory"); | |
// SAVE_QUE(x), 4-tap SSE variant.
// Stores the 8 bytes of mm2 back to memory starting at x[0], then executes
// emms to empty the MMX state so x87 floating-point code can run afterwards.
// NOTE(review): x is not parenthesized in the output operand (the 8-tap
// variant uses (x)[0]), so the argument should be a plain identifier.
33 #define SAVE_QUE(x) \ | |
34 __asm __volatile("movq %%mm2, %0\n\t" \ | |
4535 | 35 "emms \n\t" \ |
3631 | 36 :"=m"(x[0]) \ |
37 : \ | |
38 :"memory"); | |
// UPDATE_QUE(in), 4-tap SSE variant.
// Pushes one new sample into the queue held in mm2: the register is shifted
// left by 16 bits (the top word falls off) and the 16-bit value at (in)[0]
// is inserted into the lowest word.
39 #define UPDATE_QUE(in) \ | |
40 __asm __volatile("psllq $16, %%mm2\n\t" \ | |
41 "pinsrw $0, %0,%%mm2\n\t" \ | |
42 : \ | |
43 :"m" ((in)[0]) \ | |
44 :"memory"); | |
// FIR(x,w,y), 4-tap SSE variant.
// Multiplies the four 16-bit words of the queue in mm2 with the four words at
// (w)[0] (pmaddwd yields two 32-bit pair sums), adds the two sums, shifts the
// 32-bit total right by 16 (logical shift) and stores the low 16 bits of that
// to (y)[0].
// The x argument is not used: the samples come from mm2, so LOAD_QUE and
// UPDATE_QUE must have run first. mm0 and mm1 are overwritten as scratch
// (not declared as clobbers); esi is declared clobbered.
45 #define FIR(x,w,y) \ | |
46 __asm __volatile("movq %%mm2, %%mm0\n\t" \ | |
47 "pmaddwd %1, %%mm0\n\t" \ | |
48 "movq %%mm0, %%mm1\n\t" \ | |
49 "psrlq $32, %%mm1\n\t" \ | |
50 "paddd %%mm0, %%mm1\n\t" \ | |
51 "movd %%mm1, %%esi\n\t" \ | |
52 "shrl $16, %%esi\n\t" \ | |
53 "movw %%si, %0\n\t" \ | |
54 : "=m" ((y)[0]) \ | |
55 : "m" ((w)[0]) \ | |
56 : "memory", "%esi"); | |
57 #endif /* L4 */ | |
58 | |
59 // It is possible to make the 8 tap filter a lot faster by using the | |
60 // 128 bit registers, feel free to optimize. | |
61 #ifdef L8 | |
// LOAD_QUE(x), 8-tap SSE variant.
// Loads the 8 bytes at (x)[0] into mm5 and the 8 bytes at (x)[4] into mm4.
// Together the two registers form the sample queue used by the 8-tap
// UPDATE_QUE and FIR macros. With x being int16_t* (per the header comment)
// that is samples 0..3 in mm5 and samples 4..7 in mm4.
// mm4/mm5 are not named in any operand or clobber list, so the queue only
// survives between asm statements if the compiler leaves them alone.
62 #define LOAD_QUE(x) \ | |
63 __asm __volatile("movq %0, %%mm5\n\t" \ | |
64 "movq %1, %%mm4\n\t" \ | |
65 : \ | |
66 :"m"((x)[0]), \ | |
67 "m"((x)[4]) \ | |
68 :"memory"); | |
// SAVE_QUE(x), 8-tap SSE variant.
// Stores mm5 to the 8 bytes at (x)[0] and mm4 to the 8 bytes at (x)[4], then
// executes emms to empty the MMX state so x87 floating-point code can run
// afterwards.
69 #define SAVE_QUE(x) \ | |
70 __asm __volatile("movq %%mm5, %0\n\t" \ | |
71 "movq %%mm4, %1\n\t" \ | |
4535 | 72 "emms \n\t" \ |
3631 | 73 :"=m"((x)[0]), \ |
74 "=m"((x)[4]) \ | |
75 : \ | |
76 :"memory"); | |
77 | |
78 // The operation below could replace lines 2 to 5 of the macro below, but | |
79 // cannot be used because of a compiler bug ??? | |
80 // "pextrw $3, %%mm5,%%eax\n\t" | |
// UPDATE_QUE(in), 8-tap SSE variant.
// Pushes one new sample into the 8-word queue held in mm5 (low half) and mm4
// (high half):
//   1. mm4 is shifted left by 16 bits, dropping its top word;
//   2. the top word of mm5 is extracted (copy to mm0, shift right by 48,
//      move to eax) and inserted into the lowest word of mm4;
//   3. mm5 is shifted left by 16 bits and the 16-bit value at (in)[0] is
//      inserted into its lowest word.
// mm0 is overwritten as scratch (not declared as a clobber); eax is declared
// clobbered.
81 #define UPDATE_QUE(in) \ | |
82 __asm __volatile("psllq $16, %%mm4\n\t" \ | |
83 "movq %%mm5, %%mm0\n\t" \ | |
84 "psrlq $48, %%mm0\n\t" \ | |
85 "movd %%mm0, %%eax\n\t" \ | |
86 "pinsrw $0, %%eax,%%mm4\n\t" \ | |
87 "psllq $16, %%mm5\n\t" \ | |
88 "pinsrw $0, %0,%%mm5\n\t" \ | |
89 : \ | |
90 :"m" ((in)[0]) \ | |
91 :"memory", "%eax"); | |
// FIR(x,w,y), 8-tap SSE variant.
// Multiplies the four words in mm5 with the four words at (w)[0] and the four
// words in mm4 with the four words at (w)[4] (pmaddwd yields two 32-bit pair
// sums each), adds all four partial sums into one 32-bit total, shifts it
// right by 16 (logical shift) and stores the low 16 bits of that to (y)[0].
// The x argument is not used: the samples come from mm5/mm4, so LOAD_QUE and
// UPDATE_QUE must have run first. mm0 and mm1 are overwritten as scratch
// (not declared as clobbers); esi is declared clobbered.
92 #define FIR(x,w,y) \ | |
93 __asm __volatile("movq %%mm5, %%mm0\n\t" \ | |
94 "pmaddwd %1, %%mm0\n\t" \ | |
95 "movq %%mm4, %%mm1\n\t" \ | |
96 "pmaddwd %2, %%mm1\n\t" \ | |
97 "paddd %%mm1, %%mm0\n\t" \ | |
98 "movq %%mm0, %%mm1\n\t" \ | |
99 "psrlq $32, %%mm1\n\t" \ | |
100 "paddd %%mm0, %%mm1\n\t" \ | |
101 "movd %%mm1, %%esi\n\t" \ | |
102 "shrl $16, %%esi\n\t" \ | |
103 "movw %%si, %0\n\t" \ | |
104 : "=m" ((y)[0]) \ | |
105 : "m" ((w)[0]), \ | |
106 "m" ((w)[4]) \ | |
107 : "memory", "%esi"); | |
108 #endif /* L8 */ | |
109 | |
110 #else /* HAVE_SSE */ | |
111 | |
// Non-SSE build: LOAD_QUE and SAVE_QUE expand to nothing, so call sites
// written for the SSE variants still compile. The argument is discarded.
112 #define LOAD_QUE(x) | |
113 #define SAVE_QUE(x) | |
/* UPDATE_QUE(inm), plain C variant: push the sample *(inm) into the queue.
 *
 * The macro uses three names that must exist at the expansion site:
 *   xi - current queue index (read and written),
 *   x  - sample buffer; indices xi and xi+L are written, so it needs at
 *        least 2*L elements,
 *   L  - queue length; the "& (L - 1)" wrap-around presumably expects a
 *        power of two (TODO confirm against the caller).
 *
 * The index is stepped back by one with wrap-around and the new sample is
 * written twice, L elements apart.
 *
 * The index update is written as (xi - 1) rather than (--xi): assigning to
 * xi while also pre-decrementing it in the same expression modifies xi
 * twice without sequencing, which is undefined behaviour in C.
 *
 * The expansion is two statements and ends in ';' (kept for compatibility
 * with existing call sites), so do not use it as the unbraced body of an
 * if/else or loop.
 */
#define UPDATE_QUE(inm) \
  xi = (xi - 1) & (L - 1); \
  x[xi] = x[xi + L] = *(inm);
3631 | 117 |
118 #ifdef L4 | |
/* FIR(x,w,y), plain C 4-tap variant.
 *
 * Stores ((w)[0]*(x)[0] + (w)[1]*(x)[1] + (w)[2]*(x)[2] + (w)[3]*(x)[3]) >> 16
 * into (y)[0].  The products are evaluated in int after the usual integer
 * promotions.
 *
 * Every macro argument is parenthesized so that pointer expressions such as
 * FIR(buf + off, w, out) index the intended element.
 *
 * The expansion ends in ';' (kept for compatibility with existing call
 * sites).
 */
#define FIR(x,w,y) \
  (y)[0] = ((w)[0]*(x)[0] + (w)[1]*(x)[1] + (w)[2]*(x)[2] + (w)[3]*(x)[3]) >> 16;
121 #else | |
/* FIR(x,w,y), plain C 8-tap variant.
 *
 * Computes the dot product of taps 0..3 and of taps 4..7 separately, shifts
 * each partial sum right by 16, truncates each to int16_t, and stores their
 * sum in (y)[0].
 *
 * NOTE(review): the SSE 8-tap variant adds all partial sums first and shifts
 * once, so the two builds can differ by one in the least significant bit.
 * The original shift-then-add order is kept here to preserve behaviour.
 *
 * Every macro argument is parenthesized so that pointer expressions such as
 * FIR(buf + off, w, out) index the intended element, and the temporaries
 * carry a fir_ prefix so they cannot capture caller variables named a or b.
 *
 * The expansion is a brace-enclosed block (kept for compatibility with call
 * sites that omit the trailing ';').
 */
#define FIR(x,w,y){ \
  int16_t fir_lo_ = ((w)[0]*(x)[0] + (w)[1]*(x)[1] + (w)[2]*(x)[2] + (w)[3]*(x)[3]) >> 16; \
  int16_t fir_hi_ = ((w)[4]*(x)[4] + (w)[5]*(x)[5] + (w)[6]*(x)[6] + (w)[7]*(x)[7]) >> 16; \
  (y)[0] = fir_lo_ + fir_hi_; \
}
127 #endif /* L4 */ | |
128 | |
129 #endif /* HAVE_SSE */ | |
130 | |
131 #endif /* __FIR_H__ */ | |
132 | |
133 |