Mercurial > mplayer.hg
diff libao2/fir.h @ 4725:534ef9323eca
MMX part rewritten and 16 tap filter added for better sound quality
author | anders |
---|---|
date | Sat, 16 Feb 2002 13:08:14 +0000 |
parents | 99dc749591e2 |
children | c2bb05709676 |
line wrap: on
line diff
--- a/libao2/fir.h Sat Feb 16 13:06:45 2002 +0000 +++ b/libao2/fir.h Sat Feb 16 13:08:14 2002 +0000 @@ -11,123 +11,75 @@ #ifndef __FIR_H__ #define __FIR_H__ -/* 4, 8 and 16 tap FIR filters implemented using SSE instructions - int16_t* x Input data - int16_t* y Output value - int16_t* w Filter weights - - C function - for(int i = 0 ; i < L ; i++) - *y += w[i]*x[i]; -*/ - -#ifdef HAVE_SSE +/* Fixpoint 16 bit fir filter FIR filter. The filter is implemented +both in C and MMX assembly. The filter consists of one macro +UPDATE_QUE and one inline function firn. The macro can be used for +adding new data to the circular buffer used by the filter firn. +Limitations: max length of n = 16*4 and n must be multiple of 4 (pad +fiter with zeros for other lengths). Sometimes it works with filters +longer than 4*16 (the problem is overshoot and the acumulated energy +in the filter taps). */ -// This block should be MMX only compatible, but it isn't... -#ifdef L4 -#define LOAD_QUE(x) \ - __asm __volatile("movq %0, %%mm2\n\t" \ - : \ - :"m"((x)[0]) \ - :"memory"); -#define SAVE_QUE(x) \ - __asm __volatile("movq %%mm2, %0\n\t" \ - "emms \n\t" \ - :"=m"(x[0]) \ - : \ - :"memory"); -#define UPDATE_QUE(in) \ - __asm __volatile("psllq $16, %%mm2\n\t" \ - "pinsrw $0, %0,%%mm2\n\t" \ - : \ - :"m" ((in)[0]) \ - :"memory"); -#define FIR(x,w,y) \ - __asm __volatile("movq %%mm2, %%mm0\n\t" \ - "pmaddwd %1, %%mm0\n\t" \ - "movq %%mm0, %%mm1\n\t" \ - "psrlq $32, %%mm1\n\t" \ - "paddd %%mm0, %%mm1\n\t" \ - "movd %%mm1, %%esi\n\t" \ - "shrl $16, %%esi\n\t" \ - "movw %%si, %0\n\t" \ - : "=m" ((y)[0]) \ - : "m" ((w)[0]) \ - : "memory", "%esi"); -#endif /* L4 */ +#ifdef HAVE_MMX +inline int32_t firn(int16_t* x, int16_t* w, int16_t n) +{ + register int32_t y; // Output + // Prologue + asm volatile(" pxor %mm1, %mm1;\n" ); // Clear buffer yt + // Main loop + while((n-=4)>=0){ + asm volatile( + " movq (%1), %%mm0;\n" // Load x(n:n+4) + " pmaddwd (%0), %%mm0;\n" // yt(n:n+1)=sum(x(n:n+4).*w(n:n+4)) + " 
psrld $16, %%mm0;\n" // yt(n:n+1)=yt(n:n+1)>>16 + " paddd %%mm0, %%mm1;\n" // yt(n:n+1)=yt(n-2:n-1)+yt(n:n+1) + :: "r" (w), "r" (x)); + w+=4; x+=4; + } + // Epilogue + asm volatile( + " movq %%mm1, %%mm0;\n" + " punpckhdq %%mm1, %%mm0;\n" + " paddd %%mm0, %%mm1;\n" //yt(n)=yt(n)+yt(n+1) + " movd %%mm1, %0 ;\n" //y=yt + " emms ;\n" + : "=&r" (y)); + return y; +} -// It is possible to make the 8 bit filter a lot faster by using the -// 128 bit registers, feel free to optimize. +#else /* HAVE_MMX */ + +// Same thing as above but in C +inline int32_t firn(int16_t* x, int16_t* w, int16_t n) +{ + register int32_t y=0; + while((n-=4) >=0) + y+=w[n]*x[n]+w[n+1]*x[n+1]+w[n+2]*x[n+2]+w[n+3]*x[n+3] >> 16; + return y; +} + +#endif /* HAVE_MMX */ + +// Macro to add new data to circular queue +#define UPDATE_QUE(ind,xq,xid) \ + xid=(--xid)&(L-1); \ + xq[xid]=xq[xid+L]=*(ind); + #ifdef L8 -#define LOAD_QUE(x) \ - __asm __volatile("movq %0, %%mm5\n\t" \ - "movq %1, %%mm4\n\t" \ - : \ - :"m"((x)[0]), \ - "m"((x)[4]) \ - :"memory"); -#define SAVE_QUE(x) \ - __asm __volatile("movq %%mm5, %0\n\t" \ - "movq %%mm4, %1\n\t" \ - "emms \n\t" \ - :"=m"((x)[0]), \ - "=m"((x)[4]) \ - : \ - :"memory"); - -// Below operation could replace line 2 to 5 in macro below but can -// not cause of compiler bug ??? 
-// "pextrw $3, %%mm5,%%eax\n\t" -#define UPDATE_QUE(in) \ - __asm __volatile("psllq $16, %%mm4\n\t" \ - "movq %%mm5, %%mm0\n\t" \ - "psrlq $48, %%mm0\n\t" \ - "movd %%mm0, %%eax\n\t" \ - "pinsrw $0, %%eax,%%mm4\n\t" \ - "psllq $16, %%mm5\n\t" \ - "pinsrw $0, %0,%%mm5\n\t" \ - : \ - :"m" ((in)[0]) \ - :"memory", "%eax"); -#define FIR(x,w,y) \ - __asm __volatile("movq %%mm5, %%mm0\n\t" \ - "pmaddwd %1, %%mm0\n\t" \ - "movq %%mm4, %%mm1\n\t" \ - "pmaddwd %2, %%mm1\n\t" \ - "paddd %%mm1, %%mm0\n\t" \ - "movq %%mm0, %%mm1\n\t" \ - "psrlq $32, %%mm1\n\t" \ - "paddd %%mm0, %%mm1\n\t" \ - "movd %%mm1, %%esi\n\t" \ - "shrl $16, %%esi\n\t" \ - "movw %%si, %0\n\t" \ - : "=m" ((y)[0]) \ - : "m" ((w)[0]), \ - "m" ((w)[4]) \ - : "memory", "%esi"); -#endif /* L8 */ - -#else /* HAVE_SSE */ - -#define LOAD_QUE(x) -#define SAVE_QUE(x) -#define UPDATE_QUE(inm) \ - xi=(--xi)&(L-1); \ - x[xi]=x[xi+L]=*(inm); - -#ifdef L4 -#define FIR(x,w,y) \ - y[0]=(w[0]*x[0]+w[1]*x[1]+w[2]*x[2]+w[3]*x[3]) >> 16; -#else +#ifdef HAVE_MMX +#define FIR(x,w,y) *y=(int16_t)firn(x,w,8); +#else /* HAVE_MMX */ +// Unrolled loop to speed up execution #define FIR(x,w,y){ \ int16_t a = (w[0]*x[0]+w[1]*x[1]+w[2]*x[2]+w[3]*x[3]) >> 16; \ int16_t b = (w[4]*x[4]+w[5]*x[5]+w[6]*x[6]+w[7]*x[7]) >> 16; \ y[0] = a+b; \ } -#endif /* L4 */ +#endif /* HAVE_MMX */ +#endif /* L8 */ -#endif /* HAVE_SSE */ +#ifdef L16 +#define FIR(x,w,y) *y=(int16_t)firn(x,w,16); +#endif /* L16 */ #endif /* __FIR_H__ */ - -