view mp3lib/dct64_k7.c @ 31173:b35751576d17

cosmetics: Drop pointless _s suffix from 'struct mp_image'.
author diego
date Thu, 27 May 2010 10:08:30 +0000
parents 0ad2da052b2e
children d0f70692a140
line wrap: on
line source

/*
* This code was taken from http://www.mpg123.org
* See ChangeLog of mpg123-0.59s-pre.1 for details
* Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
* Partial 3dnowex-DSP! optimization by Nick Kurshev
*
* TODO: optimize scalar 3dnow! code
* Warning: Phases 7 & 8 are not tested
*/

#include "config.h"
#include "mangle.h"
#include "mpg123.h"

/* 64-bit mask with only bit 63 set.  The inline assembly below loads it into
 * mm7 and XORs it (pxor) into a packed pair of 32-bit floats, which flips the
 * sign of the upper element and leaves the lower one unchanged.
 * It is referenced by symbol name through MANGLE(), not from C code. */
static unsigned long long int attribute_used __attribute__((aligned(8))) x_plus_minus_3dnow = 0x8000000000000000ULL;
/* The constant 1.0f.  The inline assembly below loads it into the low half of
 * mm6 (also by symbol name through MANGLE()), so that a packed multiply by mm6
 * leaves the low element unscaled. */
static float attribute_used plus_1f = 1.0;

/**
 * Butterfly transform written entirely in MMX / 3DNow! / extended-3DNow!
 * inline assembly (pswapd and pf2iw are used throughout).
 * NOTE(review): judging by the name this is the 64-point DCT of the MP3
 * synthesis filter bank - confirm against the callers.
 *
 * @param a  destination; 16-bit results are stored at byte offsets
 *           0, 32, 64, ..., 512 (17 values).
 * @param b  destination; 16-bit results are stored at byte offsets
 *           32, 64, ..., 480, and the word at a[0] is copied to b[0] by the
 *           final movsw (16 values in total).
 * @param c  source; 128 bytes are read (byte offsets 0..127) and processed as
 *           packed 32-bit floats by the 3DNow! instructions.
 *
 * Register roles for most of the routine:
 *   eax = c, esi = a, edi = b, ebx = address of costab_mmx,
 *   ecx = tmp (lower 128 bytes of scratch), edx = tmp + 128 (upper 128 bytes).
 * Each phase reads one scratch half and writes the other.  In the final
 * phases eax, ecx and ebx are reused as plain scratch registers for the
 * 16-bit stores.
 *
 * All six general purpose registers above are declared as clobbered, and the
 * MMX state is cleared with femms before returning.
 */
void dct64_MMX_3dnowex(short *a,short *b,real *c)
{
  /* Scratch area: two 128-byte halves addressed through ecx and edx. */
  char tmp[256];
    __asm__ volatile(
"       movl %2,%%eax\n\t"

"       leal 128+%3,%%edx\n\t"
"       movl %0,%%esi\n\t"
"       movl %1,%%edi\n\t"
"       movl $"MANGLE(costab_mmx)",%%ebx\n\t"
"       leal %3,%%ecx\n\t"

/* Phase 1: input (eax) -> edx half.  For each mirrored pair the sum goes to
 * the lower 64 bytes and the difference, scaled by the coefficients at
 * costab_mmx byte offsets 0..63, goes mirrored into the upper 64 bytes. */
"       movq    (%%eax), %%mm0\n\t"
"       movq    8(%%eax), %%mm4\n\t"
"       movq    %%mm0, %%mm3\n\t"
"       movq    %%mm4, %%mm7\n\t"
"       pswapd  120(%%eax), %%mm1\n\t"
"       pswapd  112(%%eax), %%mm5\n\t"
"       pfadd   %%mm1, %%mm0\n\t"
"       pfadd   %%mm5, %%mm4\n\t"
"       movq    %%mm0, (%%edx)\n\t"
"       movq    %%mm4, 8(%%edx)\n\t"
"       pfsub   %%mm1, %%mm3\n\t"
"       pfsub   %%mm5, %%mm7\n\t"
"       pfmul   (%%ebx), %%mm3\n\t"
"       pfmul   8(%%ebx), %%mm7\n\t"
"       pswapd  %%mm3, %%mm3\n\t"
"       pswapd  %%mm7, %%mm7\n\t"
"       movq    %%mm3, 120(%%edx)\n\t"
"       movq    %%mm7, 112(%%edx)\n\t"

"       movq    16(%%eax), %%mm0\n\t"
"       movq    24(%%eax), %%mm4\n\t"
"       movq    %%mm0, %%mm3\n\t"
"       movq    %%mm4, %%mm7\n\t"
"       pswapd  104(%%eax), %%mm1\n\t"
"       pswapd  96(%%eax), %%mm5\n\t"
"       pfadd   %%mm1, %%mm0\n\t"
"       pfadd   %%mm5, %%mm4\n\t"
"       movq    %%mm0, 16(%%edx)\n\t"
"       movq    %%mm4, 24(%%edx)\n\t"
"       pfsub   %%mm1, %%mm3\n\t"
"       pfsub   %%mm5, %%mm7\n\t"
"       pfmul   16(%%ebx), %%mm3\n\t"
"       pfmul   24(%%ebx), %%mm7\n\t"
"       pswapd  %%mm3, %%mm3\n\t"
"       pswapd  %%mm7, %%mm7\n\t"
"       movq    %%mm3, 104(%%edx)\n\t"
"       movq    %%mm7, 96(%%edx)\n\t"

"       movq    32(%%eax), %%mm0\n\t"
"       movq    40(%%eax), %%mm4\n\t"
"       movq    %%mm0, %%mm3\n\t"
"       movq    %%mm4, %%mm7\n\t"
"       pswapd  88(%%eax), %%mm1\n\t"
"       pswapd  80(%%eax), %%mm5\n\t"
"       pfadd   %%mm1, %%mm0\n\t"
"       pfadd   %%mm5, %%mm4\n\t"
"       movq    %%mm0, 32(%%edx)\n\t"
"       movq    %%mm4, 40(%%edx)\n\t"
"       pfsub   %%mm1, %%mm3\n\t"
"       pfsub   %%mm5, %%mm7\n\t"
"       pfmul   32(%%ebx), %%mm3\n\t"
"       pfmul   40(%%ebx), %%mm7\n\t"
"       pswapd  %%mm3, %%mm3\n\t"
"       pswapd  %%mm7, %%mm7\n\t"
"       movq    %%mm3, 88(%%edx)\n\t"
"       movq    %%mm7, 80(%%edx)\n\t"

"       movq    48(%%eax), %%mm0\n\t"
"       movq    56(%%eax), %%mm4\n\t"
"       movq    %%mm0, %%mm3\n\t"
"       movq    %%mm4, %%mm7\n\t"
"       pswapd  72(%%eax), %%mm1\n\t"
"       pswapd  64(%%eax), %%mm5\n\t"
"       pfadd   %%mm1, %%mm0\n\t"
"       pfadd   %%mm5, %%mm4\n\t"
"       movq    %%mm0, 48(%%edx)\n\t"
"       movq    %%mm4, 56(%%edx)\n\t"
"       pfsub   %%mm1, %%mm3\n\t"
"       pfsub   %%mm5, %%mm7\n\t"
"       pfmul   48(%%ebx), %%mm3\n\t"
"       pfmul   56(%%ebx), %%mm7\n\t"
"       pswapd  %%mm3, %%mm3\n\t"
"       pswapd  %%mm7, %%mm7\n\t"
"       movq    %%mm3, 72(%%edx)\n\t"
"       movq    %%mm7, 64(%%edx)\n\t"

/* Phase 2: same butterfly on bytes 0..63 of the edx half, written to bytes
 * 0..63 of the ecx half; coefficients at costab_mmx byte offsets 64..95. */

"       movq    (%%edx), %%mm0\n\t"
"       movq    8(%%edx), %%mm4\n\t"
"       movq    %%mm0, %%mm3\n\t"
"       movq    %%mm4, %%mm7\n\t"
"       pswapd  56(%%edx), %%mm1\n\t"
"       pswapd  48(%%edx), %%mm5\n\t"
"       pfadd   %%mm1, %%mm0\n\t"
"       pfadd   %%mm5, %%mm4\n\t"
"       movq    %%mm0, (%%ecx)\n\t"
"       movq    %%mm4, 8(%%ecx)\n\t"
"       pfsub   %%mm1, %%mm3\n\t"
"       pfsub   %%mm5, %%mm7\n\t"
"       pfmul   64(%%ebx), %%mm3\n\t"
"       pfmul   72(%%ebx), %%mm7\n\t"
"       pswapd  %%mm3, %%mm3\n\t"
"       pswapd  %%mm7, %%mm7\n\t"
"       movq    %%mm3, 56(%%ecx)\n\t"
"       movq    %%mm7, 48(%%ecx)\n\t"

"       movq    16(%%edx), %%mm0\n\t"
"       movq    24(%%edx), %%mm4\n\t"
"       movq    %%mm0, %%mm3\n\t"
"       movq    %%mm4, %%mm7\n\t"
"       pswapd  40(%%edx), %%mm1\n\t"
"       pswapd  32(%%edx), %%mm5\n\t"
"       pfadd   %%mm1, %%mm0\n\t"
"       pfadd   %%mm5, %%mm4\n\t"
"       movq    %%mm0, 16(%%ecx)\n\t"
"       movq    %%mm4, 24(%%ecx)\n\t"
"       pfsub   %%mm1, %%mm3\n\t"
"       pfsub   %%mm5, %%mm7\n\t"
"       pfmul   80(%%ebx), %%mm3\n\t"
"       pfmul   88(%%ebx), %%mm7\n\t"
"       pswapd  %%mm3, %%mm3\n\t"
"       pswapd  %%mm7, %%mm7\n\t"
"       movq    %%mm3, 40(%%ecx)\n\t"
"       movq    %%mm7, 32(%%ecx)\n\t"

/* Phase 3: as phase 2 but on bytes 64..127 (edx -> ecx), reusing the
 * coefficients at byte offsets 64..95.  The difference is taken with pfsubr,
 * i.e. with the operands in the opposite order. */

"       movq    64(%%edx), %%mm0\n\t"
"       movq    72(%%edx), %%mm4\n\t"
"       movq    %%mm0, %%mm3\n\t"
"       movq    %%mm4, %%mm7\n\t"
"       pswapd  120(%%edx), %%mm1\n\t"
"       pswapd  112(%%edx), %%mm5\n\t"
"       pfadd   %%mm1, %%mm0\n\t"
"       pfadd   %%mm5, %%mm4\n\t"
"       movq    %%mm0, 64(%%ecx)\n\t"
"       movq    %%mm4, 72(%%ecx)\n\t"
"       pfsubr  %%mm1, %%mm3\n\t"
"       pfsubr  %%mm5, %%mm7\n\t"
"       pfmul   64(%%ebx), %%mm3\n\t"
"       pfmul   72(%%ebx), %%mm7\n\t"
"       pswapd  %%mm3, %%mm3\n\t"
"       pswapd  %%mm7, %%mm7\n\t"
"       movq    %%mm3, 120(%%ecx)\n\t"
"       movq    %%mm7, 112(%%ecx)\n\t"

"       movq    80(%%edx), %%mm0\n\t"
"       movq    88(%%edx), %%mm4\n\t"
"       movq    %%mm0, %%mm3\n\t"
"       movq    %%mm4, %%mm7\n\t"
"       pswapd  104(%%edx), %%mm1\n\t"
"       pswapd  96(%%edx), %%mm5\n\t"
"       pfadd   %%mm1, %%mm0\n\t"
"       pfadd   %%mm5, %%mm4\n\t"
"       movq    %%mm0, 80(%%ecx)\n\t"
"       movq    %%mm4, 88(%%ecx)\n\t"
"       pfsubr  %%mm1, %%mm3\n\t"
"       pfsubr  %%mm5, %%mm7\n\t"
"       pfmul   80(%%ebx), %%mm3\n\t"
"       pfmul   88(%%ebx), %%mm7\n\t"
"       pswapd  %%mm3, %%mm3\n\t"
"       pswapd  %%mm7, %%mm7\n\t"
"       movq    %%mm3, 104(%%ecx)\n\t"
"       movq    %%mm7, 96(%%ecx)\n\t"

/* Phase 4: four 32-byte groups, ecx half -> edx half.  The coefficients at
 * byte offsets 96..111 are kept in mm2/mm6 for the whole phase; the groups
 * alternate between pfsub and pfsubr. */

"       movq    96(%%ebx), %%mm2\n\t"
"       movq    104(%%ebx), %%mm6\n\t"

"       movq    (%%ecx), %%mm0\n\t"
"       movq    8(%%ecx), %%mm4\n\t"
"       movq    %%mm0, %%mm3\n\t"
"       movq    %%mm4, %%mm7\n\t"
"       pswapd  24(%%ecx), %%mm1\n\t"
"       pswapd  16(%%ecx), %%mm5\n\t"
"       pfadd   %%mm1, %%mm0\n\t"
"       pfadd   %%mm5, %%mm4\n\t"
"       movq    %%mm0, (%%edx)\n\t"
"       movq    %%mm4, 8(%%edx)\n\t"
"       pfsub   %%mm1, %%mm3\n\t"
"       pfsub   %%mm5, %%mm7\n\t"
"       pfmul   %%mm2, %%mm3\n\t"
"       pfmul   %%mm6, %%mm7\n\t"
"       pswapd  %%mm3, %%mm3\n\t"
"       pswapd  %%mm7, %%mm7\n\t"
"       movq    %%mm3, 24(%%edx)\n\t"
"       movq    %%mm7, 16(%%edx)\n\t"

"       movq    32(%%ecx), %%mm0\n\t"
"       movq    40(%%ecx), %%mm4\n\t"
"       movq    %%mm0, %%mm3\n\t"
"       movq    %%mm4, %%mm7\n\t"
"       pswapd  56(%%ecx), %%mm1\n\t"
"       pswapd  48(%%ecx), %%mm5\n\t"
"       pfadd   %%mm1, %%mm0\n\t"
"       pfadd   %%mm5, %%mm4\n\t"
"       movq    %%mm0, 32(%%edx)\n\t"
"       movq    %%mm4, 40(%%edx)\n\t"
"       pfsubr  %%mm1, %%mm3\n\t"
"       pfsubr  %%mm5, %%mm7\n\t"
"       pfmul   %%mm2, %%mm3\n\t"
"       pfmul   %%mm6, %%mm7\n\t"
"       pswapd  %%mm3, %%mm3\n\t"
"       pswapd  %%mm7, %%mm7\n\t"
"       movq    %%mm3, 56(%%edx)\n\t"
"       movq    %%mm7, 48(%%edx)\n\t"

"       movq    64(%%ecx), %%mm0\n\t"
"       movq    72(%%ecx), %%mm4\n\t"
"       movq    %%mm0, %%mm3\n\t"
"       movq    %%mm4, %%mm7\n\t"
"       pswapd  88(%%ecx), %%mm1\n\t"
"       pswapd  80(%%ecx), %%mm5\n\t"
"       pfadd   %%mm1, %%mm0\n\t"
"       pfadd   %%mm5, %%mm4\n\t"
"       movq    %%mm0, 64(%%edx)\n\t"
"       movq    %%mm4, 72(%%edx)\n\t"
"       pfsub   %%mm1, %%mm3\n\t"
"       pfsub   %%mm5, %%mm7\n\t"
"       pfmul   %%mm2, %%mm3\n\t"
"       pfmul   %%mm6, %%mm7\n\t"
"       pswapd  %%mm3, %%mm3\n\t"
"       pswapd  %%mm7, %%mm7\n\t"
"       movq    %%mm3, 88(%%edx)\n\t"
"       movq    %%mm7, 80(%%edx)\n\t"

"       movq    96(%%ecx), %%mm0\n\t"
"       movq    104(%%ecx), %%mm4\n\t"
"       movq    %%mm0, %%mm3\n\t"
"       movq    %%mm4, %%mm7\n\t"
"       pswapd  120(%%ecx), %%mm1\n\t"
"       pswapd  112(%%ecx), %%mm5\n\t"
"       pfadd   %%mm1, %%mm0\n\t"
"       pfadd   %%mm5, %%mm4\n\t"
"       movq    %%mm0, 96(%%edx)\n\t"
"       movq    %%mm4, 104(%%edx)\n\t"
"       pfsubr  %%mm1, %%mm3\n\t"
"       pfsubr  %%mm5, %%mm7\n\t"
"       pfmul   %%mm2, %%mm3\n\t"
"       pfmul   %%mm6, %%mm7\n\t"
"       pswapd  %%mm3, %%mm3\n\t"
"       pswapd  %%mm7, %%mm7\n\t"
"       movq    %%mm3, 120(%%edx)\n\t"
"       movq    %%mm7, 112(%%edx)\n\t"

/* Phase 5: eight 16-byte groups, edx half -> ecx half.  The single
 * coefficient pair at byte offset 112 stays in mm2; within each block the
 * first group uses pfsub and the second pfsubr. */

"       movq    112(%%ebx), %%mm2\n\t"

"       movq    (%%edx), %%mm0\n\t"
"       movq    16(%%edx), %%mm4\n\t"
"       movq    %%mm0, %%mm3\n\t"
"       movq    %%mm4, %%mm7\n\t"
"       pswapd  8(%%edx), %%mm1\n\t"
"       pswapd  24(%%edx), %%mm5\n\t"
"       pfadd   %%mm1, %%mm0\n\t"
"       pfadd   %%mm5, %%mm4\n\t"
"       movq    %%mm0, (%%ecx)\n\t"
"       movq    %%mm4, 16(%%ecx)\n\t"
"       pfsub   %%mm1, %%mm3\n\t"
"       pfsubr  %%mm5, %%mm7\n\t"
"       pfmul   %%mm2, %%mm3\n\t"
"       pfmul   %%mm2, %%mm7\n\t"
"       pswapd  %%mm3, %%mm3\n\t"
"       pswapd  %%mm7, %%mm7\n\t"
"       movq    %%mm3, 8(%%ecx)\n\t"
"       movq    %%mm7, 24(%%ecx)\n\t"

"       movq    32(%%edx), %%mm0\n\t"
"       movq    48(%%edx), %%mm4\n\t"
"       movq    %%mm0, %%mm3\n\t"
"       movq    %%mm4, %%mm7\n\t"
"       pswapd  40(%%edx), %%mm1\n\t"
"       pswapd  56(%%edx), %%mm5\n\t"
"       pfadd   %%mm1, %%mm0\n\t"
"       pfadd   %%mm5, %%mm4\n\t"
"       movq    %%mm0, 32(%%ecx)\n\t"
"       movq    %%mm4, 48(%%ecx)\n\t"
"       pfsub   %%mm1, %%mm3\n\t"
"       pfsubr  %%mm5, %%mm7\n\t"
"       pfmul   %%mm2, %%mm3\n\t"
"       pfmul   %%mm2, %%mm7\n\t"
"       pswapd  %%mm3, %%mm3\n\t"
"       pswapd  %%mm7, %%mm7\n\t"
"       movq    %%mm3, 40(%%ecx)\n\t"
"       movq    %%mm7, 56(%%ecx)\n\t"

"       movq    64(%%edx), %%mm0\n\t"
"       movq    80(%%edx), %%mm4\n\t"
"       movq    %%mm0, %%mm3\n\t"
"       movq    %%mm4, %%mm7\n\t"
"       pswapd  72(%%edx), %%mm1\n\t"
"       pswapd  88(%%edx), %%mm5\n\t"
"       pfadd   %%mm1, %%mm0\n\t"
"       pfadd   %%mm5, %%mm4\n\t"
"       movq    %%mm0, 64(%%ecx)\n\t"
"       movq    %%mm4, 80(%%ecx)\n\t"
"       pfsub   %%mm1, %%mm3\n\t"
"       pfsubr  %%mm5, %%mm7\n\t"
"       pfmul   %%mm2, %%mm3\n\t"
"       pfmul   %%mm2, %%mm7\n\t"
"       pswapd  %%mm3, %%mm3\n\t"
"       pswapd  %%mm7, %%mm7\n\t"
"       movq    %%mm3, 72(%%ecx)\n\t"
"       movq    %%mm7, 88(%%ecx)\n\t"

"       movq    96(%%edx), %%mm0\n\t"
"       movq    112(%%edx), %%mm4\n\t"
"       movq    %%mm0, %%mm3\n\t"
"       movq    %%mm4, %%mm7\n\t"
"       pswapd  104(%%edx), %%mm1\n\t"
"       pswapd  120(%%edx), %%mm5\n\t"
"       pfadd   %%mm1, %%mm0\n\t"
"       pfadd   %%mm5, %%mm4\n\t"
"       movq    %%mm0, 96(%%ecx)\n\t"
"       movq    %%mm4, 112(%%ecx)\n\t"
"       pfsub   %%mm1, %%mm3\n\t"
"       pfsubr  %%mm5, %%mm7\n\t"
"       pfmul   %%mm2, %%mm3\n\t"
"       pfmul   %%mm2, %%mm7\n\t"
"       pswapd  %%mm3, %%mm3\n\t"
"       pswapd  %%mm7, %%mm7\n\t"
"       movq    %%mm3, 104(%%ecx)\n\t"
"       movq    %%mm7, 120(%%ecx)\n\t"


/* Phase 6. This is the end of easy road. */
/* Code below is coded in scalar mode. Should be optimized */
/* It reads the ecx half at byte offsets 32..127 and writes the edx half,
 * using the single coefficient at costab_mmx byte offset 120. */

"       movd    "MANGLE(plus_1f)", %%mm6\n\t"
"       punpckldq 120(%%ebx), %%mm6\n\t"      /* mm6 = 1.0 | 120(%%ebx)*/
"       movq    "MANGLE(x_plus_minus_3dnow)", %%mm7\n\t" /* mm7 = +1 | -1 */

/* With x = (lo, hi): pxor with mm7 negates hi in the copy, pfacc then yields
 * (lo + hi, lo - hi), and the multiply by mm6 scales only the difference. */
"       movq    32(%%ecx), %%mm0\n\t"
"       movq    64(%%ecx), %%mm2\n\t"
"       movq    %%mm0, %%mm1\n\t"
"       movq    %%mm2, %%mm3\n\t"
"       pxor    %%mm7, %%mm1\n\t"
"       pxor    %%mm7, %%mm3\n\t"
"       pfacc   %%mm1, %%mm0\n\t"
"       pfacc   %%mm3, %%mm2\n\t"
"       pfmul   %%mm6, %%mm0\n\t"
"       pfmul   %%mm6, %%mm2\n\t"
"       movq    %%mm0, 32(%%edx)\n\t"
"       movq    %%mm2, 64(%%edx)\n\t"

"       movd    44(%%ecx), %%mm0\n\t"
"       movd    40(%%ecx), %%mm2\n\t"
"       movd    120(%%ebx), %%mm3\n\t"
"       punpckldq 76(%%ecx), %%mm0\n\t"
"       punpckldq 72(%%ecx), %%mm2\n\t"
"       punpckldq %%mm3, %%mm3\n\t"
"       movq    %%mm0, %%mm4\n\t"
"       movq    %%mm2, %%mm5\n\t"
"       pfsub   %%mm2, %%mm0\n\t"
"       pfmul   %%mm3, %%mm0\n\t"
"       movq    %%mm0, %%mm1\n\t"
"       pfadd   %%mm5, %%mm0\n\t"
"       pfadd   %%mm4, %%mm0\n\t"
"       movq    %%mm0, %%mm2\n\t"
"       punpckldq %%mm1, %%mm0\n\t"
"       punpckhdq %%mm1, %%mm2\n\t"
"       movq    %%mm0, 40(%%edx)\n\t"
"       movq    %%mm2, 72(%%edx)\n\t"

"       movd   48(%%ecx), %%mm3\n\t"
"       movd   60(%%ecx), %%mm2\n\t"
"       pfsub  52(%%ecx), %%mm3\n\t"
"       pfsub  56(%%ecx), %%mm2\n\t"
"       pfmul 120(%%ebx), %%mm3\n\t"
"       pfmul 120(%%ebx), %%mm2\n\t"
"       movq    %%mm2, %%mm1\n\t"

"       pfadd  56(%%ecx), %%mm1\n\t"
"       pfadd  60(%%ecx), %%mm1\n\t"
"       movq    %%mm1, %%mm0\n\t"

"       pfadd  48(%%ecx), %%mm0\n\t"
"       pfadd  52(%%ecx), %%mm0\n\t"
"       pfadd   %%mm3, %%mm1\n\t"
"       punpckldq %%mm2, %%mm1\n\t"
"       pfadd   %%mm3, %%mm2\n\t"
"       punpckldq %%mm2, %%mm0\n\t"
"       movq    %%mm1, 56(%%edx)\n\t"
"       movq    %%mm0, 48(%%edx)\n\t"

/*---*/

"       movd   92(%%ecx), %%mm1\n\t"
"       pfsub  88(%%ecx), %%mm1\n\t"
"       pfmul 120(%%ebx), %%mm1\n\t"
"       movd   %%mm1, 92(%%edx)\n\t"
"       pfadd  92(%%ecx), %%mm1\n\t"
"       pfadd  88(%%ecx), %%mm1\n\t"
"       movq   %%mm1, %%mm0\n\t"

"       pfadd  80(%%ecx), %%mm0\n\t"
"       pfadd  84(%%ecx), %%mm0\n\t"
"       movd   %%mm0, 80(%%edx)\n\t"

"       movd   80(%%ecx), %%mm0\n\t"
"       pfsub  84(%%ecx), %%mm0\n\t"
"       pfmul 120(%%ebx), %%mm0\n\t"
"       pfadd  %%mm0, %%mm1\n\t"
"       pfadd  92(%%edx), %%mm0\n\t"
"       punpckldq %%mm1, %%mm0\n\t"
"       movq   %%mm0, 84(%%edx)\n\t"

"       movq    96(%%ecx), %%mm0\n\t"
"       movq    %%mm0, %%mm1\n\t"
"       pxor    %%mm7, %%mm1\n\t"
"       pfacc   %%mm1, %%mm0\n\t"
"       pfmul   %%mm6, %%mm0\n\t"
"       movq    %%mm0, 96(%%edx)\n\t"

"       movd  108(%%ecx), %%mm0\n\t"
"       pfsub 104(%%ecx), %%mm0\n\t"
"       pfmul 120(%%ebx), %%mm0\n\t"
"       movd  %%mm0, 108(%%edx)\n\t"
"       pfadd 104(%%ecx), %%mm0\n\t"
"       pfadd 108(%%ecx), %%mm0\n\t"
"       movd  %%mm0, 104(%%edx)\n\t"

"       movd  124(%%ecx), %%mm1\n\t"
"       pfsub 120(%%ecx), %%mm1\n\t"
"       pfmul 120(%%ebx), %%mm1\n\t"
"       movd  %%mm1, 124(%%edx)\n\t"
"       pfadd 120(%%ecx), %%mm1\n\t"
"       pfadd 124(%%ecx), %%mm1\n\t"
"       movq  %%mm1, %%mm0\n\t"

"       pfadd 112(%%ecx), %%mm0\n\t"
"       pfadd 116(%%ecx), %%mm0\n\t"
"       movd  %%mm0, 112(%%edx)\n\t"

"       movd  112(%%ecx), %%mm0\n\t"
"       pfsub 116(%%ecx), %%mm0\n\t"
"       pfmul 120(%%ebx), %%mm0\n\t"
"       pfadd %%mm0,%%mm1\n\t"
/* NOTE(review): edx is tmp + 128, so 124(%edx) is tmp + 252.  If pfadd reads
 * a full 64-bit memory operand, this access spans tmp[252..259], i.e. 4 bytes
 * past the end of tmp.  Only the low 32 bits of the result survive the
 * punpckldq below - confirm whether tmp should be enlarged. */
"       pfadd 124(%%edx), %%mm0\n\t"
"       punpckldq %%mm1, %%mm0\n\t"
"       movq  %%mm0, 116(%%edx)\n\t"

// this code is broken, there is nothing modifying the z flag above.
/* Phases 7 and 8 below are compiled out together with the branch. */
#if 0
"       jnz .L01\n\t"

/* Phase 7*/
/* Code below is coded in scalar mode. Should be optimized */

"       movd      (%%ecx), %%mm0\n\t"
"       pfadd    4(%%ecx), %%mm0\n\t"
"       movd     %%mm0, 1024(%%esi)\n\t"

"       movd      (%%ecx), %%mm0\n\t"
"       pfsub    4(%%ecx), %%mm0\n\t"
"       pfmul  120(%%ebx), %%mm0\n\t"
"       movd      %%mm0, (%%esi)\n\t"
"       movd      %%mm0, (%%edi)\n\t"

"       movd   12(%%ecx), %%mm0\n\t"
"       pfsub   8(%%ecx), %%mm0\n\t"
"       pfmul 120(%%ebx), %%mm0\n\t"
"       movd    %%mm0, 512(%%edi)\n\t"
"       pfadd   12(%%ecx), %%mm0\n\t"
"       pfadd   8(%%ecx), %%mm0\n\t"
"       movd    %%mm0, 512(%%esi)\n\t"

"       movd   16(%%ecx), %%mm0\n\t"
"       pfsub  20(%%ecx), %%mm0\n\t"
"       pfmul 120(%%ebx), %%mm0\n\t"
"       movq    %%mm0, %%mm3\n\t"

"       movd   28(%%ecx), %%mm0\n\t"
"       pfsub  24(%%ecx), %%mm0\n\t"
"       pfmul 120(%%ebx), %%mm0\n\t"
"       movd    %%mm0, 768(%%edi)\n\t"
"       movq    %%mm0, %%mm2\n\t"

"       pfadd  24(%%ecx), %%mm0\n\t"
"       pfadd  28(%%ecx), %%mm0\n\t"
"       movq    %%mm0, %%mm1\n\t"

"       pfadd  16(%%ecx), %%mm0\n\t"
"       pfadd  20(%%ecx), %%mm0\n\t"
"       movd   %%mm0, 768(%%esi)\n\t"
"       pfadd  %%mm3, %%mm1\n\t"
"       movd   %%mm1, 256(%%esi)\n\t"
"       pfadd  %%mm3, %%mm2\n\t"
"       movd   %%mm2, 256(%%edi)\n\t"

/* Phase 8*/

"       movq   32(%%edx), %%mm0\n\t"
"       movq   48(%%edx), %%mm1\n\t"
"       pfadd  48(%%edx), %%mm0\n\t"
"       pfadd  40(%%edx), %%mm1\n\t"
"       movd   %%mm0, 896(%%esi)\n\t"
"       movd   %%mm1, 640(%%esi)\n\t"
"       psrlq  $32, %%mm0\n\t"
"       psrlq  $32, %%mm1\n\t"
"       movd   %%mm0, 128(%%edi)\n\t"
"       movd   %%mm1, 384(%%edi)\n\t"

"       movd   40(%%edx), %%mm0\n\t"
"       pfadd  56(%%edx), %%mm0\n\t"
"       movd   %%mm0, 384(%%esi)\n\t"

"       movd   56(%%edx), %%mm0\n\t"
"       pfadd  36(%%edx), %%mm0\n\t"
"       movd   %%mm0, 128(%%esi)\n\t"

"       movd   60(%%edx), %%mm0\n\t"
"       movd   %%mm0, 896(%%edi)\n\t"
"       pfadd  44(%%edx), %%mm0\n\t"
"       movd   %%mm0, 640(%%edi)\n\t"

"       movq   96(%%edx), %%mm0\n\t"
"       movq   112(%%edx), %%mm2\n\t"
"       movq   104(%%edx), %%mm4\n\t"
"       pfadd  112(%%edx), %%mm0\n\t"
"       pfadd  104(%%edx), %%mm2\n\t"
"       pfadd  120(%%edx), %%mm4\n\t"
"       movq   %%mm0, %%mm1\n\t"
"       movq   %%mm2, %%mm3\n\t"
"       movq   %%mm4, %%mm5\n\t"
"       pfadd  64(%%edx), %%mm0\n\t"
"       pfadd  80(%%edx), %%mm2\n\t"
"       pfadd  72(%%edx), %%mm4\n\t"
"       movd   %%mm0, 960(%%esi)\n\t"
"       movd   %%mm2, 704(%%esi)\n\t"
"       movd   %%mm4, 448(%%esi)\n\t"
"       psrlq  $32, %%mm0\n\t"
"       psrlq  $32, %%mm2\n\t"
"       psrlq  $32, %%mm4\n\t"
"       movd   %%mm0, 64(%%edi)\n\t"
"       movd   %%mm2, 320(%%edi)\n\t"
"       movd   %%mm4, 576(%%edi)\n\t"
"       pfadd  80(%%edx), %%mm1\n\t"
"       pfadd  72(%%edx), %%mm3\n\t"
"       pfadd  88(%%edx), %%mm5\n\t"
"       movd   %%mm1, 832(%%esi)\n\t"
"       movd   %%mm3, 576(%%esi)\n\t"
"       movd   %%mm5, 320(%%esi)\n\t"
"       psrlq  $32, %%mm1\n\t"
"       psrlq  $32, %%mm3\n\t"
"       psrlq  $32, %%mm5\n\t"
"       movd   %%mm1, 192(%%edi)\n\t"
"       movd   %%mm3, 448(%%edi)\n\t"
"       movd   %%mm5, 704(%%edi)\n\t"

"       movd   120(%%edx), %%mm0\n\t"
"       pfadd  100(%%edx), %%mm0\n\t"
"       movq   %%mm0, %%mm1\n\t"
"       pfadd  88(%%edx), %%mm0\n\t"
"       movd   %%mm0, 192(%%esi)\n\t"
"       pfadd  68(%%edx), %%mm1\n\t"
"       movd   %%mm1, 64(%%esi)\n\t"

"       movd  124(%%edx), %%mm0\n\t"
"       movd  %%mm0, 960(%%edi)\n\t"
"       pfadd  92(%%edx), %%mm0\n\t"
"       movd  %%mm0, 832(%%edi)\n\t"

"       jmp     .L_bye\n\t"
".L01:  \n\t"
#endif
/* Phase 9: finishes bytes 0..31 of the ecx half.  Results are converted with
 * pf2iw and their low 16 bits stored to a (esi) and b (edi) with movw.
 * From here on eax is reused as a scratch register, and mm7 stops holding
 * the sign mask once the first pf2iw targets it. */

"       movq    (%%ecx), %%mm0\n\t"
"       movq    %%mm0, %%mm1\n\t"
"       pxor    %%mm7, %%mm1\n\t"
"       pfacc   %%mm1, %%mm0\n\t"
"       pfmul   %%mm6, %%mm0\n\t"
"       pf2iw   %%mm0, %%mm0\n\t"
"       movd    %%mm0, %%eax\n\t"
"       movw    %%ax, 512(%%esi)\n\t"
"       psrlq   $32, %%mm0\n\t"
"       movd    %%mm0, %%eax\n\t"
"       movw    %%ax, (%%esi)\n\t"

"       movd    12(%%ecx), %%mm0\n\t"
"       pfsub    8(%%ecx), %%mm0\n\t"
"       pfmul  120(%%ebx), %%mm0\n\t"
"       pf2iw    %%mm0, %%mm7\n\t"
"       movd     %%mm7, %%eax\n\t"
"       movw     %%ax, 256(%%edi)\n\t"
"       pfadd   12(%%ecx), %%mm0\n\t"
"       pfadd    8(%%ecx), %%mm0\n\t"
"       pf2iw    %%mm0, %%mm0\n\t"
"       movd     %%mm0, %%eax\n\t"
"       movw     %%ax, 256(%%esi)\n\t"

"       movd   16(%%ecx), %%mm3\n\t"
"       pfsub  20(%%ecx), %%mm3\n\t"
"       pfmul  120(%%ebx), %%mm3\n\t"
/* The copy into mm2 is overwritten by the movd that follows it. */
"       movq   %%mm3, %%mm2\n\t"

"       movd   28(%%ecx), %%mm2\n\t"
"       pfsub  24(%%ecx), %%mm2\n\t"
"       pfmul 120(%%ebx), %%mm2\n\t"
"       movq   %%mm2, %%mm1\n\t"

"       pf2iw  %%mm2, %%mm7\n\t"
"       movd   %%mm7, %%eax\n\t"
"       movw   %%ax, 384(%%edi)\n\t"

"       pfadd  24(%%ecx), %%mm1\n\t"
"       pfadd  28(%%ecx), %%mm1\n\t"
"       movq   %%mm1, %%mm0\n\t"

"       pfadd  16(%%ecx), %%mm0\n\t"
"       pfadd  20(%%ecx), %%mm0\n\t"
"       pf2iw  %%mm0, %%mm0\n\t"
"       movd   %%mm0, %%eax\n\t"
"       movw   %%ax, 384(%%esi)\n\t"
"       pfadd  %%mm3, %%mm1\n\t"
"       pf2iw  %%mm1, %%mm1\n\t"
"       movd   %%mm1, %%eax\n\t"
"       movw   %%ax, 128(%%esi)\n\t"
"       pfadd  %%mm3, %%mm2\n\t"
"       pf2iw  %%mm2, %%mm2\n\t"
"       movd   %%mm2, %%eax\n\t"
"       movw   %%ax, 128(%%edi)\n\t"

/* Phase 10: combines the values left in the edx half by phase 6 and stores
 * the remaining 16-bit outputs.  ecx and later ebx are reused as scratch
 * registers here, so the lower scratch pointer and the coefficient table
 * pointer are no longer available. */

"       movq    32(%%edx), %%mm0\n\t"
"       movq    48(%%edx), %%mm1\n\t"
"       pfadd   48(%%edx), %%mm0\n\t"
"       pfadd   40(%%edx), %%mm1\n\t"
"       pf2iw   %%mm0, %%mm0\n\t"
"       pf2iw   %%mm1, %%mm1\n\t"
"       movd    %%mm0, %%eax\n\t"
"       movd    %%mm1, %%ecx\n\t"
"       movw    %%ax, 448(%%esi)\n\t"
"       movw    %%cx, 320(%%esi)\n\t"
"       psrlq   $32, %%mm0\n\t"
"       psrlq   $32, %%mm1\n\t"
"       movd    %%mm0, %%eax\n\t"
"       movd    %%mm1, %%ecx\n\t"
"       movw    %%ax, 64(%%edi)\n\t"
"       movw    %%cx, 192(%%edi)\n\t"

"       movd   40(%%edx), %%mm3\n\t"
"       movd   56(%%edx), %%mm4\n\t"
"       movd   60(%%edx), %%mm0\n\t"
"       movd   44(%%edx), %%mm2\n\t"
"       movd  120(%%edx), %%mm5\n\t"
"       punpckldq %%mm4, %%mm3\n\t"
"       punpckldq 124(%%edx), %%mm0\n\t"
"       pfadd 100(%%edx), %%mm5\n\t"
"       punpckldq 36(%%edx), %%mm4\n\t"
"       punpckldq 92(%%edx), %%mm2\n\t"
"       movq  %%mm5, %%mm6\n\t"
"       pfadd  %%mm4, %%mm3\n\t"
"       pf2iw  %%mm0, %%mm1\n\t"
"       pf2iw  %%mm3, %%mm3\n\t"
"       pfadd  88(%%edx), %%mm5\n\t"
"       movd   %%mm1, %%eax\n\t"
"       movd   %%mm3, %%ecx\n\t"
"       movw   %%ax, 448(%%edi)\n\t"
"       movw   %%cx, 192(%%esi)\n\t"
"       pf2iw  %%mm5, %%mm5\n\t"
"       psrlq  $32, %%mm1\n\t"
"       psrlq  $32, %%mm3\n\t"
"       movd   %%mm5, %%ebx\n\t"
"       movd   %%mm1, %%eax\n\t"
"       movd   %%mm3, %%ecx\n\t"
"       movw   %%bx, 96(%%esi)\n\t"
"       movw   %%ax, 480(%%edi)\n\t"
"       movw   %%cx, 64(%%esi)\n\t"
"       pfadd  %%mm2, %%mm0\n\t"
"       pf2iw  %%mm0, %%mm0\n\t"
"       movd   %%mm0, %%eax\n\t"
"       pfadd  68(%%edx), %%mm6\n\t"
"       movw   %%ax, 320(%%edi)\n\t"
"       psrlq  $32, %%mm0\n\t"
"       pf2iw  %%mm6, %%mm6\n\t"
"       movd   %%mm0, %%eax\n\t"
"       movd   %%mm6, %%ebx\n\t"
"       movw   %%ax, 416(%%edi)\n\t"
"       movw   %%bx, 32(%%esi)\n\t"

"       movq   96(%%edx), %%mm0\n\t"
"       movq  112(%%edx), %%mm2\n\t"
"       movq  104(%%edx), %%mm4\n\t"
"       pfadd %%mm2, %%mm0\n\t"
"       pfadd %%mm4, %%mm2\n\t"
"       pfadd 120(%%edx), %%mm4\n\t"
"       movq  %%mm0, %%mm1\n\t"
"       movq  %%mm2, %%mm3\n\t"
"       movq  %%mm4, %%mm5\n\t"
"       pfadd  64(%%edx), %%mm0\n\t"
"       pfadd  80(%%edx), %%mm2\n\t"
"       pfadd  72(%%edx), %%mm4\n\t"
"       pf2iw  %%mm0, %%mm0\n\t"
"       pf2iw  %%mm2, %%mm2\n\t"
"       pf2iw  %%mm4, %%mm4\n\t"
"       movd   %%mm0, %%eax\n\t"
"       movd   %%mm2, %%ecx\n\t"
"       movd   %%mm4, %%ebx\n\t"
"       movw   %%ax, 480(%%esi)\n\t"
"       movw   %%cx, 352(%%esi)\n\t"
"       movw   %%bx, 224(%%esi)\n\t"
"       psrlq  $32, %%mm0\n\t"
"       psrlq  $32, %%mm2\n\t"
"       psrlq  $32, %%mm4\n\t"
"       movd   %%mm0, %%eax\n\t"
"       movd   %%mm2, %%ecx\n\t"
"       movd   %%mm4, %%ebx\n\t"
"       movw   %%ax, 32(%%edi)\n\t"
"       movw   %%cx, 160(%%edi)\n\t"
"       movw   %%bx, 288(%%edi)\n\t"
"       pfadd  80(%%edx), %%mm1\n\t"
"       pfadd  72(%%edx), %%mm3\n\t"
"       pfadd  88(%%edx), %%mm5\n\t"
"       pf2iw  %%mm1, %%mm1\n\t"
"       pf2iw  %%mm3, %%mm3\n\t"
"       pf2iw  %%mm5, %%mm5\n\t"
"       movd   %%mm1, %%eax\n\t"
"       movd   %%mm3, %%ecx\n\t"
"       movd   %%mm5, %%ebx\n\t"
"       movw   %%ax, 416(%%esi)\n\t"
"       movw   %%cx, 288(%%esi)\n\t"
"       movw   %%bx, 160(%%esi)\n\t"
"       psrlq  $32, %%mm1\n\t"
"       psrlq  $32, %%mm3\n\t"
"       psrlq  $32, %%mm5\n\t"
"       movd   %%mm1, %%eax\n\t"
"       movd   %%mm3, %%ecx\n\t"
"       movd   %%mm5, %%ebx\n\t"
"       movw   %%ax, 96(%%edi)\n\t"
"       movw   %%cx, 224(%%edi)\n\t"
"       movw   %%bx, 352(%%edi)\n\t"

/* esi and edi still hold a and b, so this copies the 16-bit word at a[0]
 * to b[0]. */
"       movsw\n\t"

/* With the #if 0 block above disabled, nothing jumps to this label. */
".L_bye:\n\t"
/* Leave MMX/3DNow! state so that x87 floating point can be used again. */
"       femms\n\t"
        :
        /* %0 = a, %1 = b, %2 = c, %3 = start of the scratch buffer */
        :"m"(a),"m"(b),"m"(c),"m"(tmp[0])
        :"memory","%eax","%ebx","%ecx","%edx","%esi","%edi");
}