changeset 3575:01a2466e035e

mmx opt
author michael
date Tue, 18 Dec 2001 01:46:41 +0000
parents 8600f40003de
children c282fd9e8534
files liba52/resample.c liba52/resample_c.c liba52/resample_mmx.c
diffstat 3 files changed, 138 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/liba52/resample.c	Tue Dec 18 00:56:08 2001 +0000
+++ b/liba52/resample.c	Tue Dec 18 01:46:41 2001 +0000
@@ -294,6 +294,51 @@
 	}
 	break;
     case A52_3F2R | A52_LFE:
+#ifdef HAVE_MMX
+	asm volatile( // MMX path: per iteration, packs two 32-bit elements from each of six source regions to 16-bit and stores the 12 words interleaved
+		"movl $-1024, %%esi		\n\t" // esi = byte index into every source region, counts -1024 -> 0
+		"movq magicF2W, %%mm7		\n\t" // mm7 = 8-byte constant at symbol magicF2W (defined outside this view; presumably the float->int16 bias -- TODO confirm)
+//		"pxor %%mm6, %%mm6		\n\t"
+		"1:				\n\t" // loop top
+		"movq 1024(%1, %%esi), %%mm0	\n\t" // slot a: region starting at %1 (f[i+256] in the C fallback)
+		"movq 3072(%1, %%esi), %%mm1	\n\t" // slot b: region starting at %1+2048 bytes (f[i+768] in the C fallback)
+		"movq 4096(%1, %%esi), %%mm2	\n\t" // slot c: region starting at %1+3072 bytes
+		"movq 5120(%1, %%esi), %%mm3	\n\t" // slot d: region starting at %1+4096 bytes
+		"movq 2048(%1, %%esi), %%mm4	\n\t" // slot e: region starting at %1+1024 bytes (f[i+512] in the C fallback)
+		"movq (%1, %%esi), %%mm5	\n\t" // slot f: region starting at %1-1024 bytes (f[i] in the C fallback)
+		"psubd %%mm7, %%mm0		\n\t" // subtract the magicF2W constant from both 32-bit lanes (same for mm1..mm5 below)
+		"psubd %%mm7, %%mm1		\n\t"
+		"psubd %%mm7, %%mm2		\n\t"
+		"psubd %%mm7, %%mm3		\n\t"
+		"psubd %%mm7, %%mm4		\n\t"
+		"psubd %%mm7, %%mm5		\n\t"
+		"leal (%%esi, %%esi, 2), %%edi	\n\t" // edi = 3*esi: output byte offset (24 bytes stored per 8 bytes consumed from each region)
+		// lane comments below list 16-bit words high -> low; lower case = first element of the pair, upper case = second
+		"packssdw %%mm2, %%mm0		\n\t" // CcAa (32-bit lanes packed to 16-bit with signed saturation)
+		"packssdw %%mm3, %%mm1		\n\t" // DdBb
+		"packssdw %%mm4, %%mm4		\n\t" // EeEe
+		"packssdw %%mm5, %%mm5		\n\t" // FfFf
+		"movq %%mm0, %%mm2		\n\t" // CcAa
+		"punpcklwd %%mm1, %%mm0		\n\t" // BAba
+		"punpckhwd %%mm1, %%mm2		\n\t" // DCdc
+		"punpcklwd %%mm5, %%mm4		\n\t" // FEfe
+		"movq %%mm0, %%mm1		\n\t" // BAba
+		"movq %%mm4, %%mm3		\n\t" // FEfe
+		"punpckldq %%mm2, %%mm0		\n\t" // dcba
+		"punpckhdq %%mm1, %%mm1		\n\t" // BABA
+		"punpckldq %%mm1, %%mm4		\n\t" // BAfe
+		"punpckhdq %%mm3, %%mm2		\n\t" // FEDC
+		// the three stores below write the 12 words in memory order a b c d e f A B C D E F
+		"movq %%mm0, (%0, %%edi)	\n\t"
+		"movq %%mm4, 8(%0, %%edi)	\n\t"
+		"movq %%mm2, 16(%0, %%edi)	\n\t"
+		"addl $8, %%esi			\n\t" // advance 8 bytes (two 32-bit elements) in every source region
+		" jnz 1b			\n\t" // repeat until esi reaches 0 (128 iterations)
+		"emms				\n\t" // leave MMX state so x87 FPU code can run afterwards
+		:: "r" (s16+1536), "r" (f+256) // %0 = end of the written output range, %1 = f+256; both are indexed with negative offsets
+		:"%esi", "%edi", "memory" // NOTE(review): mm0-mm5 and mm7 are modified but not declared as clobbers -- confirm this is safe with the compilers in use
+	);
+#else
 	for (i = 0; i < 256; i++) {
 	    s16[6*i] = convert (f[i+256]);
 	    s16[6*i+1] = convert (f[i+768]);
@@ -302,6 +347,7 @@
 	    s16[6*i+4] = convert (f[i+512]);
 	    s16[6*i+5] = convert (f[i]);
 	}
+#endif	
 	break;
     }
     return chans*256;
--- a/liba52/resample_c.c	Tue Dec 18 00:56:08 2001 +0000
+++ b/liba52/resample_c.c	Tue Dec 18 01:46:41 2001 +0000
@@ -294,6 +294,51 @@
 	}
 	break;
     case A52_3F2R | A52_LFE:
+#ifdef HAVE_MMX
+	asm volatile( // MMX path: per iteration, packs two 32-bit elements from each of six source regions to 16-bit and stores the 12 words interleaved
+		"movl $-1024, %%esi		\n\t" // esi = byte index into every source region, counts -1024 -> 0
+		"movq magicF2W, %%mm7		\n\t" // mm7 = 8-byte constant at symbol magicF2W (defined outside this view; presumably the float->int16 bias -- TODO confirm)
+//		"pxor %%mm6, %%mm6		\n\t"
+		"1:				\n\t" // loop top
+		"movq 1024(%1, %%esi), %%mm0	\n\t" // slot a: region starting at %1 (f[i+256] in the C fallback)
+		"movq 3072(%1, %%esi), %%mm1	\n\t" // slot b: region starting at %1+2048 bytes (f[i+768] in the C fallback)
+		"movq 4096(%1, %%esi), %%mm2	\n\t" // slot c: region starting at %1+3072 bytes
+		"movq 5120(%1, %%esi), %%mm3	\n\t" // slot d: region starting at %1+4096 bytes
+		"movq 2048(%1, %%esi), %%mm4	\n\t" // slot e: region starting at %1+1024 bytes (f[i+512] in the C fallback)
+		"movq (%1, %%esi), %%mm5	\n\t" // slot f: region starting at %1-1024 bytes (f[i] in the C fallback)
+		"psubd %%mm7, %%mm0		\n\t" // subtract the magicF2W constant from both 32-bit lanes (same for mm1..mm5 below)
+		"psubd %%mm7, %%mm1		\n\t"
+		"psubd %%mm7, %%mm2		\n\t"
+		"psubd %%mm7, %%mm3		\n\t"
+		"psubd %%mm7, %%mm4		\n\t"
+		"psubd %%mm7, %%mm5		\n\t"
+		"leal (%%esi, %%esi, 2), %%edi	\n\t" // edi = 3*esi: output byte offset (24 bytes stored per 8 bytes consumed from each region)
+		// lane comments below list 16-bit words high -> low; lower case = first element of the pair, upper case = second
+		"packssdw %%mm2, %%mm0		\n\t" // CcAa (32-bit lanes packed to 16-bit with signed saturation)
+		"packssdw %%mm3, %%mm1		\n\t" // DdBb
+		"packssdw %%mm4, %%mm4		\n\t" // EeEe
+		"packssdw %%mm5, %%mm5		\n\t" // FfFf
+		"movq %%mm0, %%mm2		\n\t" // CcAa
+		"punpcklwd %%mm1, %%mm0		\n\t" // BAba
+		"punpckhwd %%mm1, %%mm2		\n\t" // DCdc
+		"punpcklwd %%mm5, %%mm4		\n\t" // FEfe
+		"movq %%mm0, %%mm1		\n\t" // BAba
+		"movq %%mm4, %%mm3		\n\t" // FEfe
+		"punpckldq %%mm2, %%mm0		\n\t" // dcba
+		"punpckhdq %%mm1, %%mm1		\n\t" // BABA
+		"punpckldq %%mm1, %%mm4		\n\t" // BAfe
+		"punpckhdq %%mm3, %%mm2		\n\t" // FEDC
+		// the three stores below write the 12 words in memory order a b c d e f A B C D E F
+		"movq %%mm0, (%0, %%edi)	\n\t"
+		"movq %%mm4, 8(%0, %%edi)	\n\t"
+		"movq %%mm2, 16(%0, %%edi)	\n\t"
+		"addl $8, %%esi			\n\t" // advance 8 bytes (two 32-bit elements) in every source region
+		" jnz 1b			\n\t" // repeat until esi reaches 0 (128 iterations)
+		"emms				\n\t" // leave MMX state so x87 FPU code can run afterwards
+		:: "r" (s16+1536), "r" (f+256) // %0 = end of the written output range, %1 = f+256; both are indexed with negative offsets
+		:"%esi", "%edi", "memory" // NOTE(review): mm0-mm5 and mm7 are modified but not declared as clobbers -- confirm this is safe with the compilers in use
+	);
+#else
 	for (i = 0; i < 256; i++) {
 	    s16[6*i] = convert (f[i+256]);
 	    s16[6*i+1] = convert (f[i+768]);
@@ -302,6 +347,7 @@
 	    s16[6*i+4] = convert (f[i+512]);
 	    s16[6*i+5] = convert (f[i]);
 	}
+#endif	
 	break;
     }
     return chans*256;
--- a/liba52/resample_mmx.c	Tue Dec 18 00:56:08 2001 +0000
+++ b/liba52/resample_mmx.c	Tue Dec 18 01:46:41 2001 +0000
@@ -294,6 +294,51 @@
 	}
 	break;
     case A52_3F2R | A52_LFE:
+#ifdef HAVE_MMX
+	asm volatile( // MMX path: per iteration, packs two 32-bit elements from each of six source regions to 16-bit and stores the 12 words interleaved
+		"movl $-1024, %%esi		\n\t" // esi = byte index into every source region, counts -1024 -> 0
+		"movq magicF2W, %%mm7		\n\t" // mm7 = 8-byte constant at symbol magicF2W (defined outside this view; presumably the float->int16 bias -- TODO confirm)
+//		"pxor %%mm6, %%mm6		\n\t"
+		"1:				\n\t" // loop top
+		"movq 1024(%1, %%esi), %%mm0	\n\t" // slot a: region starting at %1 (f[i+256] in the C fallback)
+		"movq 3072(%1, %%esi), %%mm1	\n\t" // slot b: region starting at %1+2048 bytes (f[i+768] in the C fallback)
+		"movq 4096(%1, %%esi), %%mm2	\n\t" // slot c: region starting at %1+3072 bytes
+		"movq 5120(%1, %%esi), %%mm3	\n\t" // slot d: region starting at %1+4096 bytes
+		"movq 2048(%1, %%esi), %%mm4	\n\t" // slot e: region starting at %1+1024 bytes (f[i+512] in the C fallback)
+		"movq (%1, %%esi), %%mm5	\n\t" // slot f: region starting at %1-1024 bytes (f[i] in the C fallback)
+		"psubd %%mm7, %%mm0		\n\t" // subtract the magicF2W constant from both 32-bit lanes (same for mm1..mm5 below)
+		"psubd %%mm7, %%mm1		\n\t"
+		"psubd %%mm7, %%mm2		\n\t"
+		"psubd %%mm7, %%mm3		\n\t"
+		"psubd %%mm7, %%mm4		\n\t"
+		"psubd %%mm7, %%mm5		\n\t"
+		"leal (%%esi, %%esi, 2), %%edi	\n\t" // edi = 3*esi: output byte offset (24 bytes stored per 8 bytes consumed from each region)
+		// lane comments below list 16-bit words high -> low; lower case = first element of the pair, upper case = second
+		"packssdw %%mm2, %%mm0		\n\t" // CcAa (32-bit lanes packed to 16-bit with signed saturation)
+		"packssdw %%mm3, %%mm1		\n\t" // DdBb
+		"packssdw %%mm4, %%mm4		\n\t" // EeEe
+		"packssdw %%mm5, %%mm5		\n\t" // FfFf
+		"movq %%mm0, %%mm2		\n\t" // CcAa
+		"punpcklwd %%mm1, %%mm0		\n\t" // BAba
+		"punpckhwd %%mm1, %%mm2		\n\t" // DCdc
+		"punpcklwd %%mm5, %%mm4		\n\t" // FEfe
+		"movq %%mm0, %%mm1		\n\t" // BAba
+		"movq %%mm4, %%mm3		\n\t" // FEfe
+		"punpckldq %%mm2, %%mm0		\n\t" // dcba
+		"punpckhdq %%mm1, %%mm1		\n\t" // BABA
+		"punpckldq %%mm1, %%mm4		\n\t" // BAfe
+		"punpckhdq %%mm3, %%mm2		\n\t" // FEDC
+		// the three stores below write the 12 words in memory order a b c d e f A B C D E F
+		"movq %%mm0, (%0, %%edi)	\n\t"
+		"movq %%mm4, 8(%0, %%edi)	\n\t"
+		"movq %%mm2, 16(%0, %%edi)	\n\t"
+		"addl $8, %%esi			\n\t" // advance 8 bytes (two 32-bit elements) in every source region
+		" jnz 1b			\n\t" // repeat until esi reaches 0 (128 iterations)
+		"emms				\n\t" // leave MMX state so x87 FPU code can run afterwards
+		:: "r" (s16+1536), "r" (f+256) // %0 = end of the written output range, %1 = f+256; both are indexed with negative offsets
+		:"%esi", "%edi", "memory" // NOTE(review): mm0-mm5 and mm7 are modified but not declared as clobbers -- confirm this is safe with the compilers in use
+	);
+#else
 	for (i = 0; i < 256; i++) {
 	    s16[6*i] = convert (f[i+256]);
 	    s16[6*i+1] = convert (f[i+768]);
@@ -302,6 +347,7 @@
 	    s16[6*i+4] = convert (f[i+512]);
 	    s16[6*i+5] = convert (f[i]);
 	}
+#endif	
 	break;
     }
     return chans*256;