changeset 3578:79759c05911e

mmx opt
author michael
date Tue, 18 Dec 2001 03:00:04 +0000
parents 6bf4dbfb941c
children 831860fada69
files liba52/resample.c liba52/resample_c.c liba52/resample_mmx.c
diffstat 3 files changed, 114 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/liba52/resample.c	Tue Dec 18 02:48:14 2001 +0000
+++ b/liba52/resample.c	Tue Dec 18 03:00:04 2001 +0000
@@ -311,6 +311,43 @@
 #endif
 	break;
     case A52_3F | A52_LFE:
+#ifdef HAVE_MMX
+	asm volatile( // MMX path for case A52_3F | A52_LFE; each iteration handles two consecutive values from each of four input planes
+		"movl $-1024, %%esi		\n\t" // esi = negative byte index into the input, counts up to 0
+		"movq magicF2W, %%mm7		\n\t" // mm7 = magicF2W constant (defined outside this hunk)
+		"pxor %%mm6, %%mm6		\n\t" // mm6 = 0, source of the zero dwords in the output
+		"1:				\n\t" // loop head
+		"movq 1024(%1, %%esi), %%mm0	\n\t" // A,a: two dwords at byte offset +1024 from (%1 + esi)
+		"movq 3072(%1, %%esi), %%mm1	\n\t" // B,b: two dwords at byte offset +3072
+		"movq 2048(%1, %%esi), %%mm4	\n\t" // E,e: two dwords at byte offset +2048
+		"movq (%1, %%esi), %%mm5	\n\t" // F,f: two dwords at byte offset +0
+		"psubd %%mm7, %%mm0		\n\t" // subtract magicF2W from both 32-bit lanes of A,a
+		"psubd %%mm7, %%mm1		\n\t" // same for B,b
+		"psubd %%mm7, %%mm4		\n\t" // same for E,e
+		"psubd %%mm7, %%mm5		\n\t" // same for F,f
+		"leal (%%esi, %%esi, 2), %%edi	\n\t" // edi = 3*esi: byte index into the output (24 bytes written per 8 bytes read per plane)
+		
+		"packssdw %%mm4, %%mm0		\n\t" // EeAa (16-bit words listed high..low; dwords packed with signed saturation)
+		"packssdw %%mm5, %%mm1		\n\t" // FfBb
+		"movq %%mm0, %%mm2		\n\t" // EeAa (copy kept for the high-half interleave)
+		"punpcklwd %%mm1, %%mm0		\n\t" // BAba (interleave the low words of both registers)
+		"punpckhwd %%mm1, %%mm2		\n\t" // FEfe (interleave the high words of both registers)
+		"movq %%mm0, %%mm1		\n\t" // BAba
+		"punpckldq %%mm6, %%mm0		\n\t" // 00ba (low dword followed by a zero dword from mm6)
+		"punpckhdq %%mm1, %%mm1		\n\t" // BABA
+		
+		"movq %%mm0, (%0, %%edi)	\n\t" // output words 0..3 of this pair: a, b, 0, 0
+		"punpckhdq %%mm2, %%mm0		\n\t" // FE00
+		"punpckldq %%mm1, %%mm2		\n\t" // BAfe
+		"movq %%mm2, 8(%0, %%edi)	\n\t" // output words 4..7: e, f, A, B
+		"movq %%mm0, 16(%0, %%edi)	\n\t" // output words 8..11: 0, 0, E, F
+		"addl $8, %%esi			\n\t" // advance the input index by 8 bytes (one quadword per plane)
+		" jnz 1b			\n\t" // repeat until esi reaches 0 (1024/8 = 128 iterations)
+		"emms				\n\t" // leave MMX state before returning to the surrounding C code
+		:: "r" (s16+1536), "r" (f+256) // %0 = output base, %1 = input base; both biased so the negative indices start at the buffer start (assumes 2-byte s16 and 4-byte f elements - TODO confirm)
+		:"%esi", "%edi", "memory" // esi/edi are scratch, "memory" covers the stores through %0; NOTE(review): mm0-mm7 are modified but not listed as clobbers
+	);
+#else
 	for (i = 0; i < 256; i++) {
 	    s16[6*i] = convert (f[i+256]);
 	    s16[6*i+1] = convert (f[i+768]);
@@ -318,6 +355,7 @@
 	    s16[6*i+4] = convert (f[i+512]);
 	    s16[6*i+5] = convert (f[i]);
 	}
+#endif	
 	break;
     case A52_2F2R | A52_LFE:
 #ifdef HAVE_MMX
--- a/liba52/resample_c.c	Tue Dec 18 02:48:14 2001 +0000
+++ b/liba52/resample_c.c	Tue Dec 18 03:00:04 2001 +0000
@@ -311,6 +311,43 @@
 #endif
 	break;
     case A52_3F | A52_LFE:
+#ifdef HAVE_MMX
+	asm volatile( // MMX path for case A52_3F | A52_LFE; each iteration handles two consecutive values from each of four input planes
+		"movl $-1024, %%esi		\n\t" // esi = negative byte index into the input, counts up to 0
+		"movq magicF2W, %%mm7		\n\t" // mm7 = magicF2W constant (defined outside this hunk)
+		"pxor %%mm6, %%mm6		\n\t" // mm6 = 0, source of the zero dwords in the output
+		"1:				\n\t" // loop head
+		"movq 1024(%1, %%esi), %%mm0	\n\t" // A,a: two dwords at byte offset +1024 from (%1 + esi)
+		"movq 3072(%1, %%esi), %%mm1	\n\t" // B,b: two dwords at byte offset +3072
+		"movq 2048(%1, %%esi), %%mm4	\n\t" // E,e: two dwords at byte offset +2048
+		"movq (%1, %%esi), %%mm5	\n\t" // F,f: two dwords at byte offset +0
+		"psubd %%mm7, %%mm0		\n\t" // subtract magicF2W from both 32-bit lanes of A,a
+		"psubd %%mm7, %%mm1		\n\t" // same for B,b
+		"psubd %%mm7, %%mm4		\n\t" // same for E,e
+		"psubd %%mm7, %%mm5		\n\t" // same for F,f
+		"leal (%%esi, %%esi, 2), %%edi	\n\t" // edi = 3*esi: byte index into the output (24 bytes written per 8 bytes read per plane)
+		
+		"packssdw %%mm4, %%mm0		\n\t" // EeAa (16-bit words listed high..low; dwords packed with signed saturation)
+		"packssdw %%mm5, %%mm1		\n\t" // FfBb
+		"movq %%mm0, %%mm2		\n\t" // EeAa (copy kept for the high-half interleave)
+		"punpcklwd %%mm1, %%mm0		\n\t" // BAba (interleave the low words of both registers)
+		"punpckhwd %%mm1, %%mm2		\n\t" // FEfe (interleave the high words of both registers)
+		"movq %%mm0, %%mm1		\n\t" // BAba
+		"punpckldq %%mm6, %%mm0		\n\t" // 00ba (low dword followed by a zero dword from mm6)
+		"punpckhdq %%mm1, %%mm1		\n\t" // BABA
+		
+		"movq %%mm0, (%0, %%edi)	\n\t" // output words 0..3 of this pair: a, b, 0, 0
+		"punpckhdq %%mm2, %%mm0		\n\t" // FE00
+		"punpckldq %%mm1, %%mm2		\n\t" // BAfe
+		"movq %%mm2, 8(%0, %%edi)	\n\t" // output words 4..7: e, f, A, B
+		"movq %%mm0, 16(%0, %%edi)	\n\t" // output words 8..11: 0, 0, E, F
+		"addl $8, %%esi			\n\t" // advance the input index by 8 bytes (one quadword per plane)
+		" jnz 1b			\n\t" // repeat until esi reaches 0 (1024/8 = 128 iterations)
+		"emms				\n\t" // leave MMX state before returning to the surrounding C code
+		:: "r" (s16+1536), "r" (f+256) // %0 = output base, %1 = input base; both biased so the negative indices start at the buffer start (assumes 2-byte s16 and 4-byte f elements - TODO confirm)
+		:"%esi", "%edi", "memory" // esi/edi are scratch, "memory" covers the stores through %0; NOTE(review): mm0-mm7 are modified but not listed as clobbers
+	);
+#else
 	for (i = 0; i < 256; i++) {
 	    s16[6*i] = convert (f[i+256]);
 	    s16[6*i+1] = convert (f[i+768]);
@@ -318,6 +355,7 @@
 	    s16[6*i+4] = convert (f[i+512]);
 	    s16[6*i+5] = convert (f[i]);
 	}
+#endif	
 	break;
     case A52_2F2R | A52_LFE:
 #ifdef HAVE_MMX
--- a/liba52/resample_mmx.c	Tue Dec 18 02:48:14 2001 +0000
+++ b/liba52/resample_mmx.c	Tue Dec 18 03:00:04 2001 +0000
@@ -311,6 +311,43 @@
 #endif
 	break;
     case A52_3F | A52_LFE:
+#ifdef HAVE_MMX
+	asm volatile( // MMX path for case A52_3F | A52_LFE; each iteration handles two consecutive values from each of four input planes
+		"movl $-1024, %%esi		\n\t" // esi = negative byte index into the input, counts up to 0
+		"movq magicF2W, %%mm7		\n\t" // mm7 = magicF2W constant (defined outside this hunk)
+		"pxor %%mm6, %%mm6		\n\t" // mm6 = 0, source of the zero dwords in the output
+		"1:				\n\t" // loop head
+		"movq 1024(%1, %%esi), %%mm0	\n\t" // A,a: two dwords at byte offset +1024 from (%1 + esi)
+		"movq 3072(%1, %%esi), %%mm1	\n\t" // B,b: two dwords at byte offset +3072
+		"movq 2048(%1, %%esi), %%mm4	\n\t" // E,e: two dwords at byte offset +2048
+		"movq (%1, %%esi), %%mm5	\n\t" // F,f: two dwords at byte offset +0
+		"psubd %%mm7, %%mm0		\n\t" // subtract magicF2W from both 32-bit lanes of A,a
+		"psubd %%mm7, %%mm1		\n\t" // same for B,b
+		"psubd %%mm7, %%mm4		\n\t" // same for E,e
+		"psubd %%mm7, %%mm5		\n\t" // same for F,f
+		"leal (%%esi, %%esi, 2), %%edi	\n\t" // edi = 3*esi: byte index into the output (24 bytes written per 8 bytes read per plane)
+		
+		"packssdw %%mm4, %%mm0		\n\t" // EeAa (16-bit words listed high..low; dwords packed with signed saturation)
+		"packssdw %%mm5, %%mm1		\n\t" // FfBb
+		"movq %%mm0, %%mm2		\n\t" // EeAa (copy kept for the high-half interleave)
+		"punpcklwd %%mm1, %%mm0		\n\t" // BAba (interleave the low words of both registers)
+		"punpckhwd %%mm1, %%mm2		\n\t" // FEfe (interleave the high words of both registers)
+		"movq %%mm0, %%mm1		\n\t" // BAba
+		"punpckldq %%mm6, %%mm0		\n\t" // 00ba (low dword followed by a zero dword from mm6)
+		"punpckhdq %%mm1, %%mm1		\n\t" // BABA
+		
+		"movq %%mm0, (%0, %%edi)	\n\t" // output words 0..3 of this pair: a, b, 0, 0
+		"punpckhdq %%mm2, %%mm0		\n\t" // FE00
+		"punpckldq %%mm1, %%mm2		\n\t" // BAfe
+		"movq %%mm2, 8(%0, %%edi)	\n\t" // output words 4..7: e, f, A, B
+		"movq %%mm0, 16(%0, %%edi)	\n\t" // output words 8..11: 0, 0, E, F
+		"addl $8, %%esi			\n\t" // advance the input index by 8 bytes (one quadword per plane)
+		" jnz 1b			\n\t" // repeat until esi reaches 0 (1024/8 = 128 iterations)
+		"emms				\n\t" // leave MMX state before returning to the surrounding C code
+		:: "r" (s16+1536), "r" (f+256) // %0 = output base, %1 = input base; both biased so the negative indices start at the buffer start (assumes 2-byte s16 and 4-byte f elements - TODO confirm)
+		:"%esi", "%edi", "memory" // esi/edi are scratch, "memory" covers the stores through %0; NOTE(review): mm0-mm7 are modified but not listed as clobbers
+	);
+#else
 	for (i = 0; i < 256; i++) {
 	    s16[6*i] = convert (f[i+256]);
 	    s16[6*i+1] = convert (f[i+768]);
@@ -318,6 +355,7 @@
 	    s16[6*i+4] = convert (f[i+512]);
 	    s16[6*i+5] = convert (f[i]);
 	}
+#endif	
 	break;
     case A52_2F2R | A52_LFE:
 #ifdef HAVE_MMX