view mp3lib/dct64_k7.c @ 5699:1dde9686d33b

Good evening ladies and gentleman and welcome to the latest installment of the ongoing show "Reworking the docs for fun and profit". Your host Diego will be assisted by Nilmoni in presenting you: - spellchecking in all its glory - a grammar to the envy of all native speakers - answers now hopefully so clear that their respective questions shall never be asked again Somebody from the public raises his voice: "What about HTML errors?" The host is quick to answer: "Yes, there have been corrections." From the back of the auditory comes a subdued question: "And the FONT tags..?" The room falls silent. There is no answer and the host twitches. Finally the words "They have not been touched." escape from his mouth, barely audible. A murmur erupts but the jury nods and calms the crowd "Time to get back to serious hacking.". The host leaves the stage under polite applause and everybody scuttles off for their notebooks...
author arpi
date Fri, 19 Apr 2002 07:30:49 +0000
parents 421969d55d5f
children d6eab895c742
line wrap: on
line source

/*
* This code was taken from http://www.mpg123.org
* See ChangeLog of mpg123-0.59s-pre.1 for detail
* Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
* Partial 3dnowex-DSP! optimization by Nick Kurshev
*
* TODO: optimize scalar 3dnow! code
* Warning: Phases 7 & 8 are not tested
*/
#define real float /* ugly - but only way */

#include "../mangle.h"

static unsigned long long int __attribute__((aligned(8))) x_plus_minus_3dnow = 0x8000000000000000ULL;
static float plus_1f = 1.0;

void __attribute__ (( __stdcall__ )) dct64_MMX_3dnowex(real *a,real *b,real *c)
{
  char tmp[256];
    __asm __volatile(
"	movl %2,%%eax\n\t"

"	leal 128+%3,%%edx\n\t"
"	movl %0,%%esi\n\t"
"	movl %1,%%edi\n\t"
"	movl $"MANGLE(costab_mmx)",%%ebx\n\t"
"	leal %3,%%ecx\n\t"

/* Phase 1*/
"	movq	(%%eax), %%mm0\n\t"
"	movq	8(%%eax), %%mm4\n\t"
"	movq	%%mm0, %%mm3\n\t"
"	movq	%%mm4, %%mm7\n\t"
"	movq	120(%%eax), %%mm1\n\t"
"	movq	112(%%eax), %%mm5\n\t"
"	pswapd	%%mm1, %%mm1\n\t"
"	pswapd	%%mm5, %%mm5\n\t"
"	pfadd	%%mm1, %%mm0\n\t"
"	pfadd	%%mm5, %%mm4\n\t"
"	movq	%%mm0, (%%edx)\n\t"
"	movq	%%mm4, 8(%%edx)\n\t"
"	pfsub	%%mm1, %%mm3\n\t"
"	pfsub	%%mm5, %%mm7\n\t"
"	pfmul	(%%ebx), %%mm3\n\t"
"	pfmul	8(%%ebx), %%mm7\n\t"
"	pswapd	%%mm3, %%mm3\n\t"
"	pswapd	%%mm7, %%mm7\n\t"
"	movq	%%mm3, 120(%%edx)\n\t"
"	movq	%%mm7, 112(%%edx)\n\t"

"	movq	16(%%eax), %%mm0\n\t"
"	movq	24(%%eax), %%mm4\n\t"
"	movq	%%mm0, %%mm3\n\t"
"	movq	%%mm4, %%mm7\n\t"
"	movq	104(%%eax), %%mm1\n\t"
"	movq	96(%%eax), %%mm5\n\t"
"	pswapd	%%mm1, %%mm1\n\t"
"	pswapd	%%mm5, %%mm5\n\t"
"	pfadd	%%mm1, %%mm0\n\t"
"	pfadd	%%mm5, %%mm4\n\t"
"	movq	%%mm0, 16(%%edx)\n\t"
"	movq	%%mm4, 24(%%edx)\n\t"
"	pfsub	%%mm1, %%mm3\n\t"
"	pfsub	%%mm5, %%mm7\n\t"
"	pfmul	16(%%ebx), %%mm3\n\t"
"	pfmul	24(%%ebx), %%mm7\n\t"
"	pswapd	%%mm3, %%mm3\n\t"
"	pswapd	%%mm7, %%mm7\n\t"
"	movq	%%mm3, 104(%%edx)\n\t"
"	movq	%%mm7, 96(%%edx)\n\t"

"	movq	32(%%eax), %%mm0\n\t"
"	movq	40(%%eax), %%mm4\n\t"
"	movq	%%mm0, %%mm3\n\t"
"	movq	%%mm4, %%mm7\n\t"
"	movq	88(%%eax), %%mm1\n\t"
"	movq	80(%%eax), %%mm5\n\t"
"	pswapd	%%mm1, %%mm1\n\t"
"	pswapd	%%mm5, %%mm5\n\t"
"	pfadd	%%mm1, %%mm0\n\t"
"	pfadd	%%mm5, %%mm4\n\t"
"	movq	%%mm0, 32(%%edx)\n\t"
"	movq	%%mm4, 40(%%edx)\n\t"
"	pfsub	%%mm1, %%mm3\n\t"
"	pfsub	%%mm5, %%mm7\n\t"
"	pfmul	32(%%ebx), %%mm3\n\t"
"	pfmul	40(%%ebx), %%mm7\n\t"
"	pswapd	%%mm3, %%mm3\n\t"
"	pswapd	%%mm7, %%mm7\n\t"
"	movq	%%mm3, 88(%%edx)\n\t"
"	movq	%%mm7, 80(%%edx)\n\t"

"	movq	48(%%eax), %%mm0\n\t"
"	movq	56(%%eax), %%mm4\n\t"
"	movq	%%mm0, %%mm3\n\t"
"	movq	%%mm4, %%mm7\n\t"
"	movq	72(%%eax), %%mm1\n\t"
"	movq	64(%%eax), %%mm5\n\t"
"	pswapd	%%mm1, %%mm1\n\t"
"	pswapd	%%mm5, %%mm5\n\t"
"	pfadd	%%mm1, %%mm0\n\t"
"	pfadd	%%mm5, %%mm4\n\t"
"	movq	%%mm0, 48(%%edx)\n\t"
"	movq	%%mm4, 56(%%edx)\n\t"
"	pfsub	%%mm1, %%mm3\n\t"
"	pfsub	%%mm5, %%mm7\n\t"
"	pfmul	48(%%ebx), %%mm3\n\t"
"	pfmul	56(%%ebx), %%mm7\n\t"
"	pswapd	%%mm3, %%mm3\n\t"
"	pswapd	%%mm7, %%mm7\n\t"
"	movq	%%mm3, 72(%%edx)\n\t"
"	movq	%%mm7, 64(%%edx)\n\t"

/* Phase 2*/

"	movq	(%%edx), %%mm0\n\t"
"	movq	8(%%edx), %%mm4\n\t"
"	movq	%%mm0, %%mm3\n\t"
"	movq	%%mm4, %%mm7\n\t"
"	movq	56(%%edx), %%mm1\n\t"
"	movq	48(%%edx), %%mm5\n\t"
"	pswapd	%%mm1, %%mm1\n\t"
"	pswapd	%%mm5, %%mm5\n\t"
"	pfadd	%%mm1, %%mm0\n\t"
"	pfadd	%%mm5, %%mm4\n\t"
"	movq	%%mm0, (%%ecx)\n\t"
"	movq	%%mm4, 8(%%ecx)\n\t"
"	pfsub	%%mm1, %%mm3\n\t"
"	pfsub	%%mm5, %%mm7\n\t"
"	pfmul	64(%%ebx), %%mm3\n\t"
"	pfmul	72(%%ebx), %%mm7\n\t"
"	pswapd	%%mm3, %%mm3\n\t"
"	pswapd	%%mm7, %%mm7\n\t"
"	movq	%%mm3, 56(%%ecx)\n\t"
"	movq	%%mm7, 48(%%ecx)\n\t"

"	movq	16(%%edx), %%mm0\n\t"
"	movq	24(%%edx), %%mm4\n\t"
"	movq	%%mm0, %%mm3\n\t"
"	movq	%%mm4, %%mm7\n\t"
"	movq	40(%%edx), %%mm1\n\t"
"	movq	32(%%edx), %%mm5\n\t"
"	pswapd	%%mm1, %%mm1\n\t"
"	pswapd	%%mm5, %%mm5\n\t"
"	pfadd	%%mm1, %%mm0\n\t"
"	pfadd	%%mm5, %%mm4\n\t"
"	movq	%%mm0, 16(%%ecx)\n\t"
"	movq	%%mm4, 24(%%ecx)\n\t"
"	pfsub	%%mm1, %%mm3\n\t"
"	pfsub	%%mm5, %%mm7\n\t"
"	pfmul	80(%%ebx), %%mm3\n\t"
"	pfmul	88(%%ebx), %%mm7\n\t"
"	pswapd	%%mm3, %%mm3\n\t"
"	pswapd	%%mm7, %%mm7\n\t"
"	movq	%%mm3, 40(%%ecx)\n\t"
"	movq	%%mm7, 32(%%ecx)\n\t"

/* Phase 3*/

"	movq	64(%%edx), %%mm0\n\t"
"	movq	72(%%edx), %%mm4\n\t"
"	movq	%%mm0, %%mm3\n\t"
"	movq	%%mm4, %%mm7\n\t"
"	movq	120(%%edx), %%mm1\n\t"
"	movq	112(%%edx), %%mm5\n\t"
"	pswapd	%%mm1, %%mm1\n\t"
"	pswapd	%%mm5, %%mm5\n\t"
"	pfadd	%%mm1, %%mm0\n\t"
"	pfadd	%%mm5, %%mm4\n\t"
"	movq	%%mm0, 64(%%ecx)\n\t"
"	movq	%%mm4, 72(%%ecx)\n\t"
"	pfsubr	%%mm1, %%mm3\n\t"
"	pfsubr	%%mm5, %%mm7\n\t"
"	pfmul	64(%%ebx), %%mm3\n\t"
"	pfmul	72(%%ebx), %%mm7\n\t"
"	pswapd	%%mm3, %%mm3\n\t"
"	pswapd	%%mm7, %%mm7\n\t"
"	movq	%%mm3, 120(%%ecx)\n\t"
"	movq	%%mm7, 112(%%ecx)\n\t"

"	movq	80(%%edx), %%mm0\n\t"
"	movq	88(%%edx), %%mm4\n\t"
"	movq	%%mm0, %%mm3\n\t"
"	movq	%%mm4, %%mm7\n\t"
"	movq	104(%%edx), %%mm1\n\t"
"	movq	96(%%edx), %%mm5\n\t"
"	pswapd	%%mm1, %%mm1\n\t"
"	pswapd	%%mm5, %%mm5\n\t"
"	pfadd	%%mm1, %%mm0\n\t"
"	pfadd	%%mm5, %%mm4\n\t"
"	movq	%%mm0, 80(%%ecx)\n\t"
"	movq	%%mm4, 88(%%ecx)\n\t"
"	pfsubr	%%mm1, %%mm3\n\t"
"	pfsubr	%%mm5, %%mm7\n\t"
"	pfmul	80(%%ebx), %%mm3\n\t"
"	pfmul	88(%%ebx), %%mm7\n\t"
"	pswapd	%%mm3, %%mm3\n\t"
"	pswapd	%%mm7, %%mm7\n\t"
"	movq	%%mm3, 104(%%ecx)\n\t"
"	movq	%%mm7, 96(%%ecx)\n\t"

/* Phase 4*/

"	movq	96(%%ebx), %%mm2\n\t"
"	movq	104(%%ebx), %%mm6\n\t"

"	movq	(%%ecx), %%mm0\n\t"
"	movq	8(%%ecx), %%mm4\n\t"
"	movq	%%mm0, %%mm3\n\t"
"	movq	%%mm4, %%mm7\n\t"
"	movq	24(%%ecx), %%mm1\n\t"
"	movq	16(%%ecx), %%mm5\n\t"
"	pswapd	%%mm1, %%mm1\n\t"
"	pswapd	%%mm5, %%mm5\n\t"
"	pfadd	%%mm1, %%mm0\n\t"
"	pfadd	%%mm5, %%mm4\n\t"
"	movq	%%mm0, (%%edx)\n\t"
"	movq	%%mm4, 8(%%edx)\n\t"
"	pfsub	%%mm1, %%mm3\n\t"
"	pfsub	%%mm5, %%mm7\n\t"
"	pfmul	%%mm2, %%mm3\n\t"
"	pfmul	%%mm6, %%mm7\n\t"
"	pswapd	%%mm3, %%mm3\n\t"
"	pswapd	%%mm7, %%mm7\n\t"
"	movq	%%mm3, 24(%%edx)\n\t"
"	movq	%%mm7, 16(%%edx)\n\t"

"	movq	32(%%ecx), %%mm0\n\t"
"	movq	40(%%ecx), %%mm4\n\t"
"	movq	%%mm0, %%mm3\n\t"
"	movq	%%mm4, %%mm7\n\t"
"	movq	56(%%ecx), %%mm1\n\t"
"	movq	48(%%ecx), %%mm5\n\t"
"	pswapd	%%mm1, %%mm1\n\t"
"	pswapd	%%mm5, %%mm5\n\t"
"	pfadd	%%mm1, %%mm0\n\t"
"	pfadd	%%mm5, %%mm4\n\t"
"	movq	%%mm0, 32(%%edx)\n\t"
"	movq	%%mm4, 40(%%edx)\n\t"
"	pfsubr	%%mm1, %%mm3\n\t"
"	pfsubr	%%mm5, %%mm7\n\t"
"	pfmul	%%mm2, %%mm3\n\t"
"	pfmul	%%mm6, %%mm7\n\t"
"	pswapd	%%mm3, %%mm3\n\t"
"	pswapd	%%mm7, %%mm7\n\t"
"	movq	%%mm3, 56(%%edx)\n\t"
"	movq	%%mm7, 48(%%edx)\n\t"

"	movq	64(%%ecx), %%mm0\n\t"
"	movq	72(%%ecx), %%mm4\n\t"
"	movq	%%mm0, %%mm3\n\t"
"	movq	%%mm4, %%mm7\n\t"
"	movq	88(%%ecx), %%mm1\n\t"
"	movq	80(%%ecx), %%mm5\n\t"
"	pswapd	%%mm1, %%mm1\n\t"
"	pswapd	%%mm5, %%mm5\n\t"
"	pfadd	%%mm1, %%mm0\n\t"
"	pfadd	%%mm5, %%mm4\n\t"
"	movq	%%mm0, 64(%%edx)\n\t"
"	movq	%%mm4, 72(%%edx)\n\t"
"	pfsub	%%mm1, %%mm3\n\t"
"	pfsub	%%mm5, %%mm7\n\t"
"	pfmul	%%mm2, %%mm3\n\t"
"	pfmul	%%mm6, %%mm7\n\t"
"	pswapd	%%mm3, %%mm3\n\t"
"	pswapd	%%mm7, %%mm7\n\t"
"	movq	%%mm3, 88(%%edx)\n\t"
"	movq	%%mm7, 80(%%edx)\n\t"

"	movq	96(%%ecx), %%mm0\n\t"
"	movq	104(%%ecx), %%mm4\n\t"
"	movq	%%mm0, %%mm3\n\t"
"	movq	%%mm4, %%mm7\n\t"
"	movq	120(%%ecx), %%mm1\n\t"
"	movq	112(%%ecx), %%mm5\n\t"
"	pswapd	%%mm1, %%mm1\n\t"
"	pswapd	%%mm5, %%mm5\n\t"
"	pfadd	%%mm1, %%mm0\n\t"
"	pfadd	%%mm5, %%mm4\n\t"
"	movq	%%mm0, 96(%%edx)\n\t"
"	movq	%%mm4, 104(%%edx)\n\t"
"	pfsubr	%%mm1, %%mm3\n\t"
"	pfsubr	%%mm5, %%mm7\n\t"
"	pfmul	%%mm2, %%mm3\n\t"
"	pfmul	%%mm6, %%mm7\n\t"
"	pswapd	%%mm3, %%mm3\n\t"
"	pswapd	%%mm7, %%mm7\n\t"
"	movq	%%mm3, 120(%%edx)\n\t"
"	movq	%%mm7, 112(%%edx)\n\t"

/* Phase 5 */

"	movq	112(%%ebx), %%mm2\n\t"

"	movq	(%%edx), %%mm0\n\t"
"	movq	16(%%edx), %%mm4\n\t"
"	movq	%%mm0, %%mm3\n\t"
"	movq	%%mm4, %%mm7\n\t"
"	movq	8(%%edx), %%mm1\n\t"
"	movq	24(%%edx), %%mm5\n\t"
"	pswapd	%%mm1, %%mm1\n\t"
"	pswapd	%%mm5, %%mm5\n\t"
"	pfadd	%%mm1, %%mm0\n\t"
"	pfadd	%%mm5, %%mm4\n\t"
"	movq	%%mm0, (%%ecx)\n\t"
"	movq	%%mm4, 16(%%ecx)\n\t"
"	pfsub	%%mm1, %%mm3\n\t"
"	pfsubr	%%mm5, %%mm7\n\t"
"	pfmul	%%mm2, %%mm3\n\t"
"	pfmul	%%mm2, %%mm7\n\t"
"	pswapd	%%mm3, %%mm3\n\t"
"	pswapd	%%mm7, %%mm7\n\t"
"	movq	%%mm3, 8(%%ecx)\n\t"
"	movq	%%mm7, 24(%%ecx)\n\t"

"	movq	32(%%edx), %%mm0\n\t"
"	movq	48(%%edx), %%mm4\n\t"
"	movq	%%mm0, %%mm3\n\t"
"	movq	%%mm4, %%mm7\n\t"
"	movq	40(%%edx), %%mm1\n\t"
"	movq	56(%%edx), %%mm5\n\t"
"	pswapd	%%mm1, %%mm1\n\t"
"	pswapd	%%mm5, %%mm5\n\t"
"	pfadd	%%mm1, %%mm0\n\t"
"	pfadd	%%mm5, %%mm4\n\t"
"	movq	%%mm0, 32(%%ecx)\n\t"
"	movq	%%mm4, 48(%%ecx)\n\t"
"	pfsub	%%mm1, %%mm3\n\t"
"	pfsubr	%%mm5, %%mm7\n\t"
"	pfmul	%%mm2, %%mm3\n\t"
"	pfmul	%%mm2, %%mm7\n\t"
"	pswapd	%%mm3, %%mm3\n\t"
"	pswapd	%%mm7, %%mm7\n\t"
"	movq	%%mm3, 40(%%ecx)\n\t"
"	movq	%%mm7, 56(%%ecx)\n\t"

"	movq	64(%%edx), %%mm0\n\t"
"	movq	80(%%edx), %%mm4\n\t"
"	movq	%%mm0, %%mm3\n\t"
"	movq	%%mm4, %%mm7\n\t"
"	movq	72(%%edx), %%mm1\n\t"
"	movq	88(%%edx), %%mm5\n\t"
"	pswapd	%%mm1, %%mm1\n\t"
"	pswapd	%%mm5, %%mm5\n\t"
"	pfadd	%%mm1, %%mm0\n\t"
"	pfadd	%%mm5, %%mm4\n\t"
"	movq	%%mm0, 64(%%ecx)\n\t"
"	movq	%%mm4, 80(%%ecx)\n\t"
"	pfsub	%%mm1, %%mm3\n\t"
"	pfsubr	%%mm5, %%mm7\n\t"
"	pfmul	%%mm2, %%mm3\n\t"
"	pfmul	%%mm2, %%mm7\n\t"
"	pswapd	%%mm3, %%mm3\n\t"
"	pswapd	%%mm7, %%mm7\n\t"
"	movq	%%mm3, 72(%%ecx)\n\t"
"	movq	%%mm7, 88(%%ecx)\n\t"

"	movq	96(%%edx), %%mm0\n\t"
"	movq	112(%%edx), %%mm4\n\t"
"	movq	%%mm0, %%mm3\n\t"
"	movq	%%mm4, %%mm7\n\t"
"	movq	104(%%edx), %%mm1\n\t"
"	movq	120(%%edx), %%mm5\n\t"
"	pswapd	%%mm1, %%mm1\n\t"
"	pswapd	%%mm5, %%mm5\n\t"
"	pfadd	%%mm1, %%mm0\n\t"
"	pfadd	%%mm5, %%mm4\n\t"
"	movq	%%mm0, 96(%%ecx)\n\t"
"	movq	%%mm4, 112(%%ecx)\n\t"
"	pfsub	%%mm1, %%mm3\n\t"
"	pfsubr	%%mm5, %%mm7\n\t"
"	pfmul	%%mm2, %%mm3\n\t"
"	pfmul	%%mm2, %%mm7\n\t"
"	pswapd	%%mm3, %%mm3\n\t"
"	pswapd	%%mm7, %%mm7\n\t"
"	movq	%%mm3, 104(%%ecx)\n\t"
"	movq	%%mm7, 120(%%ecx)\n\t"


/* Phase 6. This is the end of easy road. */
/* Code below is coded in scalar mode. Should be optimized */

"	movd	"MANGLE(plus_1f)", %%mm6\n\t"
"	punpckldq 120(%%ebx), %%mm6\n\t"      /* mm6 = 1.0 | 120(%%ebx)*/
"	movq	x_plus_minus_3dnow, %%mm7\n\t" /* mm7 = +1 | -1 */

"	movq	32(%%ecx), %%mm0\n\t"
"	movq	64(%%ecx), %%mm2\n\t"
"	movq	%%mm0, %%mm1\n\t"
"	movq	%%mm2, %%mm3\n\t"
"	pxor	%%mm7, %%mm1\n\t"
"	pxor	%%mm7, %%mm3\n\t"
"	pfacc	%%mm1, %%mm0\n\t"
"	pfacc	%%mm3, %%mm2\n\t"
"	pfmul	%%mm6, %%mm0\n\t"
"	pfmul	%%mm6, %%mm2\n\t"
"	movq	%%mm0, 32(%%edx)\n\t"
"	movq	%%mm2, 64(%%edx)\n\t"

"	movd	44(%%ecx), %%mm0\n\t"
"	movd	40(%%ecx), %%mm2\n\t"
"	movd	120(%%ebx), %%mm3\n\t"
"	punpckldq 76(%%ecx), %%mm0\n\t"
"	punpckldq 72(%%ecx), %%mm2\n\t"
"	punpckldq %%mm3, %%mm3\n\t"
"	movq	%%mm0, %%mm4\n\t"
"	movq	%%mm2, %%mm5\n\t"
"	pfsub	%%mm2, %%mm0\n\t"
"	pfmul	%%mm3, %%mm0\n\t"
"	movq	%%mm0, %%mm1\n\t"
"	pfadd	%%mm5, %%mm0\n\t"
"	pfadd	%%mm4, %%mm0\n\t"
"	movq	%%mm0, %%mm2\n\t"
"	punpckldq %%mm1, %%mm0\n\t"
"	punpckhdq %%mm1, %%mm2\n\t"
"	movq	%%mm0, 40(%%edx)\n\t"
"	movq	%%mm2, 72(%%edx)\n\t"

"	movd   48(%%ecx), %%mm3\n\t"
"	movd   60(%%ecx), %%mm2\n\t"
"	pfsub  52(%%ecx), %%mm3\n\t"
"	pfsub  56(%%ecx), %%mm2\n\t"
"	pfmul 120(%%ebx), %%mm3\n\t"
"	pfmul 120(%%ebx), %%mm2\n\t"
"	movq	%%mm2, %%mm1\n\t"

"	pfadd  56(%%ecx), %%mm1\n\t"
"	pfadd  60(%%ecx), %%mm1\n\t"
"	movq	%%mm1, %%mm0\n\t"

"	pfadd  48(%%ecx), %%mm0\n\t"
"	pfadd  52(%%ecx), %%mm0\n\t"
"	pfadd	%%mm3, %%mm1\n\t"
"	punpckldq %%mm2, %%mm1\n\t"
"	pfadd	%%mm3, %%mm2\n\t"
"	punpckldq %%mm2, %%mm0\n\t"
"	movq	%%mm1, 56(%%edx)\n\t"
"	movq	%%mm0, 48(%%edx)\n\t"

/*---*/

"	movd   92(%%ecx), %%mm1\n\t"
"	pfsub  88(%%ecx), %%mm1\n\t"
"	pfmul 120(%%ebx), %%mm1\n\t"
"	movd   %%mm1, 92(%%edx)\n\t"
"	pfadd  92(%%ecx), %%mm1\n\t"
"	pfadd  88(%%ecx), %%mm1\n\t"
"	movq   %%mm1, %%mm0\n\t"

"	pfadd  80(%%ecx), %%mm0\n\t"
"	pfadd  84(%%ecx), %%mm0\n\t"
"	movd   %%mm0, 80(%%edx)\n\t"

"	movd   80(%%ecx), %%mm0\n\t"
"	pfsub  84(%%ecx), %%mm0\n\t"
"	pfmul 120(%%ebx), %%mm0\n\t"
"	pfadd  %%mm0, %%mm1\n\t"
"	pfadd  92(%%edx), %%mm0\n\t"
"	punpckldq %%mm1, %%mm0\n\t"
"	movq   %%mm0, 84(%%edx)\n\t"

"	movq	96(%%ecx), %%mm0\n\t"
"	movq	%%mm0, %%mm1\n\t"
"	pxor	%%mm7, %%mm1\n\t"
"	pfacc	%%mm1, %%mm0\n\t"
"	pfmul	%%mm6, %%mm0\n\t"
"	movq	%%mm0, 96(%%edx)\n\t"

"	movd  108(%%ecx), %%mm0\n\t"
"	pfsub 104(%%ecx), %%mm0\n\t"
"	pfmul 120(%%ebx), %%mm0\n\t"
"	movd  %%mm0, 108(%%edx)\n\t"
"	pfadd 104(%%ecx), %%mm0\n\t"
"	pfadd 108(%%ecx), %%mm0\n\t"
"	movd  %%mm0, 104(%%edx)\n\t"

"	movd  124(%%ecx), %%mm1\n\t"
"	pfsub 120(%%ecx), %%mm1\n\t"
"	pfmul 120(%%ebx), %%mm1\n\t"
"	movd  %%mm1, 124(%%edx)\n\t"
"	pfadd 120(%%ecx), %%mm1\n\t"
"	pfadd 124(%%ecx), %%mm1\n\t"
"	movq  %%mm1, %%mm0\n\t"

"	pfadd 112(%%ecx), %%mm0\n\t"
"	pfadd 116(%%ecx), %%mm0\n\t"
"	movd  %%mm0, 112(%%edx)\n\t"

"	movd  112(%%ecx), %%mm0\n\t"
"	pfsub 116(%%ecx), %%mm0\n\t"
"	pfmul 120(%%ebx), %%mm0\n\t"
"	pfadd %%mm0,%%mm1\n\t"
"	pfadd 124(%%edx), %%mm0\n\t"
"	punpckldq %%mm1, %%mm0\n\t"
"	movq  %%mm0, 116(%%edx)\n\t"

"	jnz .L01\n\t"

/* Phase 7*/
/* Code below is coded in scalar mode. Should be optimized */

"	movd      (%%ecx), %%mm0\n\t"
"	pfadd    4(%%ecx), %%mm0\n\t"
"	movd     %%mm0, 1024(%%esi)\n\t"

"	movd      (%%ecx), %%mm0\n\t"
"	pfsub    4(%%ecx), %%mm0\n\t"
"	pfmul  120(%%ebx), %%mm0\n\t"
"	movd      %%mm0, (%%esi)\n\t"
"	movd      %%mm0, (%%edi)\n\t"

"	movd   12(%%ecx), %%mm0\n\t"
"	pfsub   8(%%ecx), %%mm0\n\t"
"	pfmul 120(%%ebx), %%mm0\n\t"
"	movd    %%mm0, 512(%%edi)\n\t"
"	pfadd   12(%%ecx), %%mm0\n\t"
"	pfadd   8(%%ecx), %%mm0\n\t"
"	movd    %%mm0, 512(%%esi)\n\t"

"	movd   16(%%ecx), %%mm0\n\t"
"	pfsub  20(%%ecx), %%mm0\n\t"
"	pfmul 120(%%ebx), %%mm0\n\t"
"	movq	%%mm0, %%mm3\n\t"

"	movd   28(%%ecx), %%mm0\n\t"
"	pfsub  24(%%ecx), %%mm0\n\t"
"	pfmul 120(%%ebx), %%mm0\n\t"
"	movd    %%mm0, 768(%%edi)\n\t"
"	movq	%%mm0, %%mm2\n\t"

"	pfadd  24(%%ecx), %%mm0\n\t"
"	pfadd  28(%%ecx), %%mm0\n\t"
"	movq	%%mm0, %%mm1\n\t"

"	pfadd  16(%%ecx), %%mm0\n\t"
"	pfadd  20(%%ecx), %%mm0\n\t"
"	movd   %%mm0, 768(%%esi)\n\t"
"	pfadd  %%mm3, %%mm1\n\t"
"	movd   %%mm1, 256(%%esi)\n\t"
"	pfadd  %%mm3, %%mm2\n\t"
"	movd   %%mm2, 256(%%edi)\n\t"

/* Phase 8*/

"	movq   32(%%edx), %%mm0\n\t"
"	movq   48(%%edx), %%mm1\n\t"
"	pfadd  48(%%edx), %%mm0\n\t"
"	pfadd  40(%%edx), %%mm1\n\t"
"	movd   %%mm0, 896(%%esi)\n\t"
"	movd   %%mm1, 640(%%esi)\n\t"
"	psrlq  $32, %%mm0\n\t"
"	psrlq  $32, %%mm1\n\t"
"	movd   %%mm0, 128(%%edi)\n\t"
"	movd   %%mm1, 384(%%edi)\n\t"

"	movd   40(%%edx), %%mm0\n\t"
"	pfadd  56(%%edx), %%mm0\n\t"
"	movd   %%mm0, 384(%%esi)\n\t"

"	movd   56(%%edx), %%mm0\n\t"
"	pfadd  36(%%edx), %%mm0\n\t"
"	movd   %%mm0, 128(%%esi)\n\t"

"	movd   60(%%edx), %%mm0\n\t"
"	movd   %%mm0, 896(%%edi)\n\t"
"	pfadd  44(%%edx), %%mm0\n\t"
"	movd   %%mm0, 640(%%edi)\n\t"

"	movq   96(%%edx), %%mm0\n\t"
"	movq   112(%%edx), %%mm2\n\t"
"	movq   104(%%edx), %%mm4\n\t"
"	pfadd  112(%%edx), %%mm0\n\t"
"	pfadd  104(%%edx), %%mm2\n\t"
"	pfadd  120(%%edx), %%mm4\n\t"
"	movq   %%mm0, %%mm1\n\t"
"	movq   %%mm2, %%mm3\n\t"
"	movq   %%mm4, %%mm5\n\t"
"	pfadd  64(%%edx), %%mm0\n\t"
"	pfadd  80(%%edx), %%mm2\n\t"
"	pfadd  72(%%edx), %%mm4\n\t"
"	movd   %%mm0, 960(%%esi)\n\t"
"	movd   %%mm2, 704(%%esi)\n\t"
"	movd   %%mm4, 448(%%esi)\n\t"
"	psrlq  $32, %%mm0\n\t"
"	psrlq  $32, %%mm2\n\t"
"	psrlq  $32, %%mm4\n\t"
"	movd   %%mm0, 64(%%edi)\n\t"
"	movd   %%mm2, 320(%%edi)\n\t"
"	movd   %%mm4, 576(%%edi)\n\t"
"	pfadd  80(%%edx), %%mm1\n\t"
"	pfadd  72(%%edx), %%mm3\n\t"
"	pfadd  88(%%edx), %%mm5\n\t"
"	movd   %%mm1, 832(%%esi)\n\t"
"	movd   %%mm3, 576(%%esi)\n\t"
"	movd   %%mm5, 320(%%esi)\n\t"
"	psrlq  $32, %%mm1\n\t"
"	psrlq  $32, %%mm3\n\t"
"	psrlq  $32, %%mm5\n\t"
"	movd   %%mm1, 192(%%edi)\n\t"
"	movd   %%mm3, 448(%%edi)\n\t"
"	movd   %%mm5, 704(%%edi)\n\t"

"	movd   120(%%edx), %%mm0\n\t"
"	pfadd  100(%%edx), %%mm0\n\t"
"	movq   %%mm0, %%mm1\n\t"
"	pfadd  88(%%edx), %%mm0\n\t"
"	movd   %%mm0, 192(%%esi)\n\t"
"	pfadd  68(%%edx), %%mm1\n\t"
"	movd   %%mm1, 64(%%esi)\n\t"

"	movd  124(%%edx), %%mm0\n\t"
"	movd  %%mm0, 960(%%edi)\n\t"
"	pfadd  92(%%edx), %%mm0\n\t"
"	movd  %%mm0, 832(%%edi)\n\t"

"	jmp	.L_bye\n\t"
".L01:	\n\t"
/* Phase 9*/

"	movq	(%%ecx), %%mm0\n\t"
"	movq	%%mm0, %%mm1\n\t"
"	pxor    %%mm7, %%mm1\n\t"
"	pfacc	%%mm1, %%mm0\n\t"
"	pfmul	%%mm6, %%mm0\n\t"
"	pf2id	%%mm0, %%mm0\n\t"
"	movd	%%mm0, %%eax\n\t"
"	movw    %%ax, 512(%%esi)\n\t"
"	psrlq	$32, %%mm0\n\t"
"	movd	%%mm0, %%eax\n\t"
"	movw    %%ax, (%%esi)\n\t"

"	movd    12(%%ecx), %%mm0\n\t"
"	pfsub    8(%%ecx), %%mm0\n\t"
"	pfmul  120(%%ebx), %%mm0\n\t"
"	pf2id    %%mm0, %%mm7\n\t"
"	movd	 %%mm7, %%eax\n\t"
"	movw     %%ax, 256(%%edi)\n\t"
"	pfadd   12(%%ecx), %%mm0\n\t"
"	pfadd    8(%%ecx), %%mm0\n\t"
"	pf2id    %%mm0, %%mm0\n\t"
"	movd	 %%mm0, %%eax\n\t"
"	movw     %%ax, 256(%%esi)\n\t"

"	movd   16(%%ecx), %%mm3\n\t"
"	pfsub  20(%%ecx), %%mm3\n\t"
"	pfmul  120(%%ebx), %%mm3\n\t"
"	movq   %%mm3, %%mm2\n\t"

"	movd   28(%%ecx), %%mm2\n\t"
"	pfsub  24(%%ecx), %%mm2\n\t"
"	pfmul 120(%%ebx), %%mm2\n\t"
"	movq   %%mm2, %%mm1\n\t"

"	pf2id  %%mm2, %%mm7\n\t"
"	movd   %%mm7, %%eax\n\t"
"	movw   %%ax, 384(%%edi)\n\t"

"	pfadd  24(%%ecx), %%mm1\n\t"
"	pfadd  28(%%ecx), %%mm1\n\t"
"	movq   %%mm1, %%mm0\n\t"

"	pfadd  16(%%ecx), %%mm0\n\t"
"	pfadd  20(%%ecx), %%mm0\n\t"
"	pf2id  %%mm0, %%mm0\n\t"
"	movd   %%mm0, %%eax\n\t"
"	movw   %%ax, 384(%%esi)\n\t"
"	pfadd  %%mm3, %%mm1\n\t"
"	pf2id  %%mm1, %%mm1\n\t"
"	movd   %%mm1, %%eax\n\t"
"	movw   %%ax, 128(%%esi)\n\t"
"	pfadd  %%mm3, %%mm2\n\t"
"	pf2id  %%mm2, %%mm2\n\t"
"	movd   %%mm2, %%eax\n\t"
"	movw   %%ax, 128(%%edi)\n\t"

/* Phase 10*/

"	movq    32(%%edx), %%mm0\n\t"
"	movq    48(%%edx), %%mm1\n\t"
"	pfadd   48(%%edx), %%mm0\n\t"
"	pfadd   40(%%edx), %%mm1\n\t"
"	pf2id   %%mm0, %%mm0\n\t"
"	pf2id   %%mm1, %%mm1\n\t"
"	movd	%%mm0, %%eax\n\t"
"	movd	%%mm1, %%ecx\n\t"
"	movw    %%ax, 448(%%esi)\n\t"
"	movw    %%cx, 320(%%esi)\n\t"
"	psrlq   $32, %%mm0\n\t"
"	psrlq   $32, %%mm1\n\t"
"	movd	%%mm0, %%eax\n\t"
"	movd	%%mm1, %%ecx\n\t"
"	movw    %%ax, 64(%%edi)\n\t"
"	movw    %%cx, 192(%%edi)\n\t"

"	movd   40(%%edx), %%mm3\n\t"
"	movd   56(%%edx), %%mm4\n\t"
"	movd   60(%%edx), %%mm0\n\t"
"	movd   44(%%edx), %%mm2\n\t"
"	movd  120(%%edx), %%mm5\n\t"
"	punpckldq %%mm4, %%mm3\n\t"
"	punpckldq 124(%%edx), %%mm0\n\t"
"	pfadd 100(%%edx), %%mm5\n\t"
"	punpckldq 36(%%edx), %%mm4\n\t"
"	punpckldq 92(%%edx), %%mm2\n\t"
"	movq  %%mm5, %%mm6\n\t"
"	pfadd  %%mm4, %%mm3\n\t"
"	pf2id  %%mm0, %%mm1\n\t"
"	pf2id  %%mm3, %%mm3\n\t"
"	pfadd  88(%%edx), %%mm5\n\t"
"	movd   %%mm1, %%eax\n\t"
"	movd   %%mm3, %%ecx\n\t"
"	movw   %%ax, 448(%%edi)\n\t"
"	movw   %%cx, 192(%%esi)\n\t"
"	pf2id  %%mm5, %%mm5\n\t"
"	psrlq  $32, %%mm1\n\t"
"       psrlq  $32, %%mm3\n\t"
"	movd   %%mm5, %%ebx\n\t"
"	movd   %%mm1, %%eax\n\t"
"	movd   %%mm3, %%ecx\n\t"
"	movw   %%bx, 96(%%esi)\n\t"
"	movw   %%ax, 480(%%edi)\n\t"
"	movw   %%cx, 64(%%esi)\n\t"
"	pfadd  %%mm2, %%mm0\n\t"
"	pf2id  %%mm0, %%mm0\n\t"
"	movd   %%mm0, %%eax\n\t"
"	pfadd  68(%%edx), %%mm6\n\t"
"	movw   %%ax, 320(%%edi)\n\t"
"	psrlq  $32, %%mm0\n\t"
"	pf2id  %%mm6, %%mm6\n\t"
"	movd   %%mm0, %%eax\n\t"
"	movd   %%mm6, %%ebx\n\t"
"	movw   %%ax, 416(%%edi)\n\t"
"	movw   %%bx, 32(%%esi)\n\t"

"	movq   96(%%edx), %%mm0\n\t"
"	movq  112(%%edx), %%mm2\n\t"
"	movq  104(%%edx), %%mm4\n\t"
"	pfadd %%mm2, %%mm0\n\t"
"	pfadd %%mm4, %%mm2\n\t"
"	pfadd 120(%%edx), %%mm4\n\t"
"	movq  %%mm0, %%mm1\n\t"
"	movq  %%mm2, %%mm3\n\t"
"	movq  %%mm4, %%mm5\n\t"
"	pfadd  64(%%edx), %%mm0\n\t"
"	pfadd  80(%%edx), %%mm2\n\t"
"	pfadd  72(%%edx), %%mm4\n\t"
"	pf2id  %%mm0, %%mm0\n\t"
"	pf2id  %%mm2, %%mm2\n\t"
"	pf2id  %%mm4, %%mm4\n\t"
"	movd   %%mm0, %%eax\n\t"
"	movd   %%mm2, %%ecx\n\t"
"	movd   %%mm4, %%ebx\n\t"
"	movw   %%ax, 480(%%esi)\n\t"
"	movw   %%cx, 352(%%esi)\n\t"
"	movw   %%bx, 224(%%esi)\n\t"
"	psrlq  $32, %%mm0\n\t"
"	psrlq  $32, %%mm2\n\t"
"	psrlq  $32, %%mm4\n\t"
"	movd   %%mm0, %%eax\n\t"
"	movd   %%mm2, %%ecx\n\t"
"	movd   %%mm4, %%ebx\n\t"
"	movw   %%ax, 32(%%edi)\n\t"
"	movw   %%cx, 160(%%edi)\n\t"
"	movw   %%bx, 288(%%edi)\n\t"
"	pfadd  80(%%edx), %%mm1\n\t"
"	pfadd  72(%%edx), %%mm3\n\t"
"	pfadd  88(%%edx), %%mm5\n\t"
"	pf2id  %%mm1, %%mm1\n\t"
"	pf2id  %%mm3, %%mm3\n\t"
"	pf2id  %%mm5, %%mm5\n\t"
"	movd   %%mm1, %%eax\n\t"
"	movd   %%mm3, %%ecx\n\t"
"	movd   %%mm5, %%ebx\n\t"
"	movw   %%ax, 416(%%esi)\n\t"
"	movw   %%cx, 288(%%esi)\n\t"
"	movw   %%bx, 160(%%esi)\n\t"
"	psrlq  $32, %%mm1\n\t"
"	psrlq  $32, %%mm3\n\t"
"	psrlq  $32, %%mm5\n\t"
"	movd   %%mm1, %%eax\n\t"
"	movd   %%mm3, %%ecx\n\t"
"	movd   %%mm5, %%ebx\n\t"
"	movw   %%ax, 96(%%edi)\n\t"
"	movw   %%cx, 224(%%edi)\n\t"
"	movw   %%bx, 352(%%edi)\n\t"

"	movsw\n\t"

".L_bye:\n\t"
"	femms\n\t"
	:
	:"m"(a),"m"(b),"m"(c),"m"(tmp[0])
	:"memory","%ebx","%esi","%edi");
}