view liba52/downmix.c @ 15721:eec6ace22741

small change to field-matching metrics which hopefully makes a big improvement to results. inter-field comparison is now counterbalanced with intra-field total (vertical) variation. this means that areas of extreme high frequency content, which become aliased within individual fields, will not interfere with field matching. examples: white noise effects, small kanji, very small latin text, ... may still need tweaking. please report regressions. this change will likely be made optional in the future (right now it's enclosed in "if (1)"...
author rfelker
date Tue, 14 Jun 2005 05:33:34 +0000
parents 07f1e7669772
children d6219ce521e9
line wrap: on
line source

/*
 * downmix.c
 * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
 *
 * This file is part of a52dec, a free ATSC A-52 stream decoder.
 * See http://liba52.sourceforge.net/ for updates.
 *
 * Modified for use with MPlayer, changes contained in liba52_changes.diff.
 * detailed CVS changelog at http://www.mplayerhq.hu/cgi-bin/cvsweb.cgi/main/
 * $Id$
 *
 * a52dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * a52dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
 */

#include "config.h"

#include <string.h>
#include <inttypes.h>

#include "a52.h"
#include "a52_internal.h"
#include "mm_accel.h"

#define CONVERT(acmod,output) (((output) << 3) + (acmod))


void (*downmix)(sample_t * samples, int acmod, int output, sample_t bias,
	      sample_t clev, sample_t slev)= NULL;
void (*upmix)(sample_t * samples, int acmod, int output)= NULL;

static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias,
	      sample_t clev, sample_t slev);
static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias,
	      sample_t clev, sample_t slev);
static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias,
	      sample_t clev, sample_t slev);
static void upmix_MMX (sample_t * samples, int acmod, int output);
static void upmix_C (sample_t * samples, int acmod, int output);

void downmix_accel_init(uint32_t mm_accel)
{
    upmix= upmix_C;
    downmix= downmix_C;
#ifdef ARCH_X86    
    if(mm_accel & MM_ACCEL_X86_MMX) upmix= upmix_MMX;
    if(mm_accel & MM_ACCEL_X86_SSE) downmix= downmix_SSE;
    if(mm_accel & MM_ACCEL_X86_3DNOW) downmix= downmix_3dnow;
#endif
}
   
int downmix_init (int input, int flags, sample_t * level,
		  sample_t clev, sample_t slev)
{
    static uint8_t table[11][8] = {
	{A52_CHANNEL,	A52_DOLBY,	A52_STEREO,	A52_STEREO,
	 A52_STEREO,	A52_STEREO,	A52_STEREO,	A52_STEREO},
	{A52_MONO,	A52_MONO,	A52_MONO,	A52_MONO,
	 A52_MONO,	A52_MONO,	A52_MONO,	A52_MONO},
	{A52_CHANNEL,	A52_DOLBY,	A52_STEREO,	A52_STEREO,
	 A52_STEREO,	A52_STEREO,	A52_STEREO,	A52_STEREO},
	{A52_CHANNEL,	A52_DOLBY,	A52_STEREO,	A52_3F,
	 A52_STEREO,	A52_3F,		A52_STEREO,	A52_3F},
	{A52_CHANNEL,	A52_DOLBY,	A52_STEREO,	A52_STEREO,
	 A52_2F1R,	A52_2F1R,	A52_2F1R,	A52_2F1R},
	{A52_CHANNEL,	A52_DOLBY,	A52_STEREO,	A52_STEREO,
	 A52_2F1R,	A52_3F1R,	A52_2F1R,	A52_3F1R},
	{A52_CHANNEL,	A52_DOLBY,	A52_STEREO,	A52_3F,
	 A52_2F2R,	A52_2F2R,	A52_2F2R,	A52_2F2R},
	{A52_CHANNEL,	A52_DOLBY,	A52_STEREO,	A52_3F,
	 A52_2F2R,	A52_3F2R,	A52_2F2R,	A52_3F2R},
	{A52_CHANNEL1,	A52_MONO,	A52_MONO,	A52_MONO,
	 A52_MONO,	A52_MONO,	A52_MONO,	A52_MONO},
	{A52_CHANNEL2,	A52_MONO,	A52_MONO,	A52_MONO,
	 A52_MONO,	A52_MONO,	A52_MONO,	A52_MONO},
	{A52_CHANNEL,	A52_DOLBY,	A52_STEREO,	A52_DOLBY,
	 A52_DOLBY,	A52_DOLBY,	A52_DOLBY,	A52_DOLBY}
    };
    int output;

    output = flags & A52_CHANNEL_MASK;
    if (output > A52_DOLBY)
	return -1;
    
    output = table[output][input & 7];

    if ((output == A52_STEREO) &&
	((input == A52_DOLBY) || ((input == A52_3F) && (clev == LEVEL_3DB))))
	output = A52_DOLBY;

    if (flags & A52_ADJUST_LEVEL)
	switch (CONVERT (input & 7, output)) {

	case CONVERT (A52_3F, A52_MONO):
	    *level *= LEVEL_3DB / (1 + clev);
	    break;

	case CONVERT (A52_STEREO, A52_MONO):
	case CONVERT (A52_2F2R, A52_2F1R):
	case CONVERT (A52_3F2R, A52_3F1R):
	level_3db:
	    *level *= LEVEL_3DB;
	    break;

	case CONVERT (A52_3F2R, A52_2F1R):
	    if (clev < LEVEL_PLUS3DB - 1)
		goto level_3db;
	    /* break thru */
	case CONVERT (A52_3F, A52_STEREO):
	case CONVERT (A52_3F1R, A52_2F1R):
	case CONVERT (A52_3F1R, A52_2F2R):
	case CONVERT (A52_3F2R, A52_2F2R):
	    *level /= 1 + clev;
	    break;

	case CONVERT (A52_2F1R, A52_MONO):
	    *level *= LEVEL_PLUS3DB / (2 + slev);
	    break;

	case CONVERT (A52_2F1R, A52_STEREO):
	case CONVERT (A52_3F1R, A52_3F):
	    *level /= 1 + slev * LEVEL_3DB;
	    break;

	case CONVERT (A52_3F1R, A52_MONO):
	    *level *= LEVEL_3DB / (1 + clev + 0.5 * slev);
	    break;

	case CONVERT (A52_3F1R, A52_STEREO):
	    *level /= 1 + clev + slev * LEVEL_3DB;
	    break;

	case CONVERT (A52_2F2R, A52_MONO):
	    *level *= LEVEL_3DB / (1 + slev);
	    break;

	case CONVERT (A52_2F2R, A52_STEREO):
	case CONVERT (A52_3F2R, A52_3F):
	    *level /= 1 + slev;
	    break;

	case CONVERT (A52_3F2R, A52_MONO):
	    *level *= LEVEL_3DB / (1 + clev + slev);
	    break;

	case CONVERT (A52_3F2R, A52_STEREO):
	    *level /= 1 + clev + slev;
	    break;

	case CONVERT (A52_MONO, A52_DOLBY):
	    *level *= LEVEL_PLUS3DB;
	    break;

	case CONVERT (A52_3F, A52_DOLBY):
	case CONVERT (A52_2F1R, A52_DOLBY):
	    *level *= 1 / (1 + LEVEL_3DB);
	    break;

	case CONVERT (A52_3F1R, A52_DOLBY):
	case CONVERT (A52_2F2R, A52_DOLBY):
	    *level *= 1 / (1 + 2 * LEVEL_3DB);
	    break;

	case CONVERT (A52_3F2R, A52_DOLBY):
	    *level *= 1 / (1 + 3 * LEVEL_3DB);
	    break;
	}
    return output;
}

int downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level,
		   sample_t clev, sample_t slev)
{
    switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {

    case CONVERT (A52_CHANNEL, A52_CHANNEL):
    case CONVERT (A52_MONO, A52_MONO):
    case CONVERT (A52_STEREO, A52_STEREO):
    case CONVERT (A52_3F, A52_3F):
    case CONVERT (A52_2F1R, A52_2F1R):
    case CONVERT (A52_3F1R, A52_3F1R):
    case CONVERT (A52_2F2R, A52_2F2R):
    case CONVERT (A52_3F2R, A52_3F2R):
    case CONVERT (A52_STEREO, A52_DOLBY):
	coeff[0] = coeff[1] = coeff[2] = coeff[3] = coeff[4] = level;
	return 0;

    case CONVERT (A52_CHANNEL, A52_MONO):
	coeff[0] = coeff[1] = level * LEVEL_6DB;
	return 3;

    case CONVERT (A52_STEREO, A52_MONO):
	coeff[0] = coeff[1] = level * LEVEL_3DB;
	return 3;

    case CONVERT (A52_3F, A52_MONO):
	coeff[0] = coeff[2] = level * LEVEL_3DB;
	coeff[1] = level * clev * LEVEL_PLUS3DB;
	return 7;

    case CONVERT (A52_2F1R, A52_MONO):
	coeff[0] = coeff[1] = level * LEVEL_3DB;
	coeff[2] = level * slev * LEVEL_3DB;
	return 7;

    case CONVERT (A52_2F2R, A52_MONO):
	coeff[0] = coeff[1] = level * LEVEL_3DB;
	coeff[2] = coeff[3] = level * slev * LEVEL_3DB;
	return 15;

    case CONVERT (A52_3F1R, A52_MONO):
	coeff[0] = coeff[2] = level * LEVEL_3DB;
	coeff[1] = level * clev * LEVEL_PLUS3DB;
	coeff[3] = level * slev * LEVEL_3DB;
	return 15;

    case CONVERT (A52_3F2R, A52_MONO):
	coeff[0] = coeff[2] = level * LEVEL_3DB;
	coeff[1] = level * clev * LEVEL_PLUS3DB;
	coeff[3] = coeff[4] = level * slev * LEVEL_3DB;
	return 31;

    case CONVERT (A52_MONO, A52_DOLBY):
	coeff[0] = level * LEVEL_3DB;
	return 0;

    case CONVERT (A52_3F, A52_DOLBY):
	clev = LEVEL_3DB;
    case CONVERT (A52_3F, A52_STEREO):
    case CONVERT (A52_3F1R, A52_2F1R):
    case CONVERT (A52_3F2R, A52_2F2R):
	coeff[0] = coeff[2] = coeff[3] = coeff[4] = level;
	coeff[1] = level * clev;
	return 7;

    case CONVERT (A52_2F1R, A52_DOLBY):
	slev = 1;
    case CONVERT (A52_2F1R, A52_STEREO):
	coeff[0] = coeff[1] = level;
	coeff[2] = level * slev * LEVEL_3DB;
	return 7;

    case CONVERT (A52_3F1R, A52_DOLBY):
	clev = LEVEL_3DB;
	slev = 1;
    case CONVERT (A52_3F1R, A52_STEREO):
	coeff[0] = coeff[2] = level;
	coeff[1] = level * clev;
	coeff[3] = level * slev * LEVEL_3DB;
	return 15;

    case CONVERT (A52_2F2R, A52_DOLBY):
	slev = LEVEL_3DB;
    case CONVERT (A52_2F2R, A52_STEREO):
	coeff[0] = coeff[1] = level;
	coeff[2] = coeff[3] = level * slev;
	return 15;

    case CONVERT (A52_3F2R, A52_DOLBY):
	clev = LEVEL_3DB;
    case CONVERT (A52_3F2R, A52_2F1R):
	slev = LEVEL_3DB;
    case CONVERT (A52_3F2R, A52_STEREO):
	coeff[0] = coeff[2] = level;
	coeff[1] = level * clev;
	coeff[3] = coeff[4] = level * slev;
	return 31;

    case CONVERT (A52_3F1R, A52_3F):
	coeff[0] = coeff[1] = coeff[2] = level;
	coeff[3] = level * slev * LEVEL_3DB;
	return 13;

    case CONVERT (A52_3F2R, A52_3F):
	coeff[0] = coeff[1] = coeff[2] = level;
	coeff[3] = coeff[4] = level * slev;
	return 29;

    case CONVERT (A52_2F2R, A52_2F1R):
	coeff[0] = coeff[1] = level;
	coeff[2] = coeff[3] = level * LEVEL_3DB;
	return 12;

    case CONVERT (A52_3F2R, A52_3F1R):
	coeff[0] = coeff[1] = coeff[2] = level;
	coeff[3] = coeff[4] = level * LEVEL_3DB;
	return 24;

    case CONVERT (A52_2F1R, A52_2F2R):
	coeff[0] = coeff[1] = level;
	coeff[2] = level * LEVEL_3DB;
	return 0;

    case CONVERT (A52_3F1R, A52_2F2R):
	coeff[0] = coeff[2] = level;
	coeff[1] = level * clev;
	coeff[3] = level * LEVEL_3DB;
	return 7;

    case CONVERT (A52_3F1R, A52_3F2R):
	coeff[0] = coeff[1] = coeff[2] = level;
	coeff[3] = level * LEVEL_3DB;
	return 0;

    case CONVERT (A52_CHANNEL, A52_CHANNEL1):
	coeff[0] = level;
	coeff[1] = 0;
	return 0;

    case CONVERT (A52_CHANNEL, A52_CHANNEL2):
	coeff[0] = 0;
	coeff[1] = level;
	return 0;
    }

    return -1;	/* NOTREACHED */
}

static void mix2to1 (sample_t * dest, sample_t * src, sample_t bias)
{
    int i;

    for (i = 0; i < 256; i++)
	dest[i] += src[i] + bias;
}

static void mix3to1 (sample_t * samples, sample_t bias)
{
    int i;

    for (i = 0; i < 256; i++)
	samples[i] += samples[i + 256] + samples[i + 512] + bias;
}

static void mix4to1 (sample_t * samples, sample_t bias)
{
    int i;

    for (i = 0; i < 256; i++)
	samples[i] += (samples[i + 256] + samples[i + 512] +
		       samples[i + 768] + bias);
}

static void mix5to1 (sample_t * samples, sample_t bias)
{
    int i;

    for (i = 0; i < 256; i++)
	samples[i] += (samples[i + 256] + samples[i + 512] +
		       samples[i + 768] + samples[i + 1024] + bias);
}

static void mix3to2 (sample_t * samples, sample_t bias)
{
    int i;
    sample_t common;

    for (i = 0; i < 256; i++) {
	common = samples[i + 256] + bias;
	samples[i] += common;
	samples[i + 256] = samples[i + 512] + common;
    }
}

static void mix21to2 (sample_t * left, sample_t * right, sample_t bias)
{
    int i;
    sample_t common;

    for (i = 0; i < 256; i++) {
	common = right[i + 256] + bias;
	left[i] += common;
	right[i] += common;
    }
}

static void mix21toS (sample_t * samples, sample_t bias)
{
    int i;
    sample_t surround;

    for (i = 0; i < 256; i++) {
	surround = samples[i + 512];
	samples[i] += bias - surround;
	samples[i + 256] += bias + surround;
    }
}

static void mix31to2 (sample_t * samples, sample_t bias)
{
    int i;
    sample_t common;

    for (i = 0; i < 256; i++) {
	common = samples[i + 256] + samples[i + 768] + bias;
	samples[i] += common;
	samples[i + 256] = samples[i + 512] + common;
    }
}

static void mix31toS (sample_t * samples, sample_t bias)
{
    int i;
    sample_t common, surround;

    for (i = 0; i < 256; i++) {
	common = samples[i + 256] + bias;
	surround = samples[i + 768];
	samples[i] += common - surround;
	samples[i + 256] = samples[i + 512] + common + surround;
    }
}

static void mix22toS (sample_t * samples, sample_t bias)
{
    int i;
    sample_t surround;

    for (i = 0; i < 256; i++) {
	surround = samples[i + 512] + samples[i + 768];
	samples[i] += bias - surround;
	samples[i + 256] += bias + surround;
    }
}

static void mix32to2 (sample_t * samples, sample_t bias)
{
    int i;
    sample_t common;

    for (i = 0; i < 256; i++) {
	common = samples[i + 256] + bias;
	samples[i] += common + samples[i + 768];
	samples[i + 256] = common + samples[i + 512] + samples[i + 1024];
    }
}

static void mix32toS (sample_t * samples, sample_t bias)
{
    int i;
    sample_t common, surround;

    for (i = 0; i < 256; i++) {
	common = samples[i + 256] + bias;
	surround = samples[i + 768] + samples[i + 1024];
	samples[i] += common - surround;
	samples[i + 256] = samples[i + 512] + common + surround;
    }
}

static void move2to1 (sample_t * src, sample_t * dest, sample_t bias)
{
    int i;

    for (i = 0; i < 256; i++)
	dest[i] = src[i] + src[i + 256] + bias;
}

static void zero (sample_t * samples)
{
    int i;
    for (i = 0; i < 256; i++)
	samples[i] = 0;
}

static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias,
	      sample_t clev, sample_t slev)
{
    switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {

    case CONVERT (A52_CHANNEL, A52_CHANNEL2):
	memcpy (samples, samples + 256, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_CHANNEL, A52_MONO):
    case CONVERT (A52_STEREO, A52_MONO):
    mix_2to1:
	mix2to1 (samples, samples + 256, bias);
	break;

    case CONVERT (A52_2F1R, A52_MONO):
	if (slev == 0)
	    goto mix_2to1;
    case CONVERT (A52_3F, A52_MONO):
    mix_3to1:
	mix3to1 (samples, bias);
	break;

    case CONVERT (A52_3F1R, A52_MONO):
	if (slev == 0)
	    goto mix_3to1;
    case CONVERT (A52_2F2R, A52_MONO):
	if (slev == 0)
	    goto mix_2to1;
	mix4to1 (samples, bias);
	break;

    case CONVERT (A52_3F2R, A52_MONO):
	if (slev == 0)
	    goto mix_3to1;
	mix5to1 (samples, bias);
	break;

    case CONVERT (A52_MONO, A52_DOLBY):
	memcpy (samples + 256, samples, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_3F, A52_STEREO):
    case CONVERT (A52_3F, A52_DOLBY):
    mix_3to2:
	mix3to2 (samples, bias);
	break;

    case CONVERT (A52_2F1R, A52_STEREO):
	if (slev == 0)
	    break;
	mix21to2 (samples, samples + 256, bias);
	break;

    case CONVERT (A52_2F1R, A52_DOLBY):
	mix21toS (samples, bias);
	break;

    case CONVERT (A52_3F1R, A52_STEREO):
	if (slev == 0)
	    goto mix_3to2;
	mix31to2 (samples, bias);
	break;

    case CONVERT (A52_3F1R, A52_DOLBY):
	mix31toS (samples, bias);
	break;

    case CONVERT (A52_2F2R, A52_STEREO):
	if (slev == 0)
	    break;
	mix2to1 (samples, samples + 512, bias);
	mix2to1 (samples + 256, samples + 768, bias);
	break;

    case CONVERT (A52_2F2R, A52_DOLBY):
	mix22toS (samples, bias);
	break;

    case CONVERT (A52_3F2R, A52_STEREO):
	if (slev == 0)
	    goto mix_3to2;
	mix32to2 (samples, bias);
	break;

    case CONVERT (A52_3F2R, A52_DOLBY):
	mix32toS (samples, bias);
	break;

    case CONVERT (A52_3F1R, A52_3F):
	if (slev == 0)
	    break;
	mix21to2 (samples, samples + 512, bias);
	break;

    case CONVERT (A52_3F2R, A52_3F):
	if (slev == 0)
	    break;
	mix2to1 (samples, samples + 768, bias);
	mix2to1 (samples + 512, samples + 1024, bias);
	break;

    case CONVERT (A52_3F1R, A52_2F1R):
	mix3to2 (samples, bias);
	memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_2F2R, A52_2F1R):
	mix2to1 (samples + 512, samples + 768, bias);
	break;

    case CONVERT (A52_3F2R, A52_2F1R):
	mix3to2 (samples, bias); //FIXME possible bug? (output doesnt seem to be used)
	move2to1 (samples + 768, samples + 512, bias);
	break;

    case CONVERT (A52_3F2R, A52_3F1R):
	mix2to1 (samples + 768, samples + 1024, bias);
	break;

    case CONVERT (A52_2F1R, A52_2F2R):
	memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_3F1R, A52_2F2R):
	mix3to2 (samples, bias);
	memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_3F2R, A52_2F2R):
	mix3to2 (samples, bias);
	memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
	memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_3F1R, A52_3F2R):
	memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
	break;
    }
}

static void upmix_C (sample_t * samples, int acmod, int output)
{
    switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {

    case CONVERT (A52_CHANNEL, A52_CHANNEL2):
	memcpy (samples + 256, samples, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_3F2R, A52_MONO):
	zero (samples + 1024);
    case CONVERT (A52_3F1R, A52_MONO):
    case CONVERT (A52_2F2R, A52_MONO):
	zero (samples + 768);
    case CONVERT (A52_3F, A52_MONO):
    case CONVERT (A52_2F1R, A52_MONO):
	zero (samples + 512);
    case CONVERT (A52_CHANNEL, A52_MONO):
    case CONVERT (A52_STEREO, A52_MONO):
	zero (samples + 256);
	break;

    case CONVERT (A52_3F2R, A52_STEREO):
    case CONVERT (A52_3F2R, A52_DOLBY):
	zero (samples + 1024);
    case CONVERT (A52_3F1R, A52_STEREO):
    case CONVERT (A52_3F1R, A52_DOLBY):
	zero (samples + 768);
    case CONVERT (A52_3F, A52_STEREO):
    case CONVERT (A52_3F, A52_DOLBY):
    mix_3to2:
	memcpy (samples + 512, samples + 256, 256 * sizeof (sample_t));
	zero (samples + 256);
	break;

    case CONVERT (A52_2F2R, A52_STEREO):
    case CONVERT (A52_2F2R, A52_DOLBY):
	zero (samples + 768);
    case CONVERT (A52_2F1R, A52_STEREO):
    case CONVERT (A52_2F1R, A52_DOLBY):
	zero (samples + 512);
	break;

    case CONVERT (A52_3F2R, A52_3F):
	zero (samples + 1024);
    case CONVERT (A52_3F1R, A52_3F):
    case CONVERT (A52_2F2R, A52_2F1R):
	zero (samples + 768);
	break;

    case CONVERT (A52_3F2R, A52_3F1R):
	zero (samples + 1024);
	break;

    case CONVERT (A52_3F2R, A52_2F1R):
	zero (samples + 1024);
    case CONVERT (A52_3F1R, A52_2F1R):
    mix_31to21:
	memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t));
	goto mix_3to2;

    case CONVERT (A52_3F2R, A52_2F2R):
	memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
	goto mix_31to21;
    }
}

#ifdef ARCH_X86
static void mix2to1_SSE (sample_t * dest, sample_t * src, sample_t bias)
{
	asm volatile(
	"movlps %2, %%xmm7		\n\t"
	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
	"movl $-1024, %%esi		\n\t"
	".balign 16\n\t"
	"1:				\n\t"
	"movaps (%0, %%esi), %%xmm0	\n\t" 
	"movaps 16(%0, %%esi), %%xmm1	\n\t" 
	"addps (%1, %%esi), %%xmm0	\n\t" 
	"addps 16(%1, %%esi), %%xmm1	\n\t" 
	"addps %%xmm7, %%xmm0		\n\t"
	"addps %%xmm7, %%xmm1		\n\t"
	"movaps %%xmm0, (%1, %%esi)	\n\t"
	"movaps %%xmm1, 16(%1, %%esi)	\n\t"
	"addl $32, %%esi		\n\t"
	" jnz 1b			\n\t"
	:: "r" (src+256), "r" (dest+256), "m" (bias)
	: "%esi"
	);
}

static void mix3to1_SSE (sample_t * samples, sample_t bias)
{
	asm volatile(
	"movlps %1, %%xmm7		\n\t"
	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
	"movl $-1024, %%esi		\n\t"
	".balign 16\n\t"
	"1:				\n\t"
	"movaps (%0, %%esi), %%xmm0	\n\t" 
	"movaps 1024(%0, %%esi), %%xmm1	\n\t" 
	"addps 2048(%0, %%esi), %%xmm0	\n\t" 
	"addps %%xmm7, %%xmm1		\n\t"
	"addps %%xmm1, %%xmm0		\n\t"
	"movaps %%xmm0, (%0, %%esi)	\n\t"
	"addl $16, %%esi		\n\t"
	" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

static void mix4to1_SSE (sample_t * samples, sample_t bias)
{
	asm volatile(
	"movlps %1, %%xmm7		\n\t"
	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
	"movl $-1024, %%esi		\n\t"
	".balign 16\n\t"
	"1:				\n\t"
	"movaps (%0, %%esi), %%xmm0	\n\t" 
	"movaps 1024(%0, %%esi), %%xmm1	\n\t" 
	"addps 2048(%0, %%esi), %%xmm0	\n\t" 
	"addps 3072(%0, %%esi), %%xmm1	\n\t" 
	"addps %%xmm7, %%xmm0		\n\t"
	"addps %%xmm1, %%xmm0		\n\t"
	"movaps %%xmm0, (%0, %%esi)	\n\t"
	"addl $16, %%esi		\n\t"
	" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

static void mix5to1_SSE (sample_t * samples, sample_t bias)
{
	asm volatile(
	"movlps %1, %%xmm7		\n\t"
	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
	"movl $-1024, %%esi		\n\t"
	".balign 16\n\t"
	"1:				\n\t"
	"movaps (%0, %%esi), %%xmm0	\n\t" 
	"movaps 1024(%0, %%esi), %%xmm1	\n\t" 
	"addps 2048(%0, %%esi), %%xmm0	\n\t" 
	"addps 3072(%0, %%esi), %%xmm1	\n\t" 
	"addps %%xmm7, %%xmm0		\n\t"
	"addps 4096(%0, %%esi), %%xmm1	\n\t" 
	"addps %%xmm1, %%xmm0		\n\t"
	"movaps %%xmm0, (%0, %%esi)	\n\t"
	"addl $16, %%esi		\n\t"
	" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

static void mix3to2_SSE (sample_t * samples, sample_t bias)
{
	asm volatile(
	"movlps %1, %%xmm7		\n\t"
	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
	"movl $-1024, %%esi		\n\t"
	".balign 16\n\t"
	"1:				\n\t"
	"movaps 1024(%0, %%esi), %%xmm0	\n\t" 
	"addps %%xmm7, %%xmm0		\n\t" //common
	"movaps (%0, %%esi), %%xmm1	\n\t" 
	"movaps 2048(%0, %%esi), %%xmm2	\n\t"
	"addps %%xmm0, %%xmm1		\n\t"
	"addps %%xmm0, %%xmm2		\n\t"
	"movaps %%xmm1, (%0, %%esi)	\n\t"
	"movaps %%xmm2, 1024(%0, %%esi)	\n\t"
	"addl $16, %%esi		\n\t"
	" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

static void mix21to2_SSE (sample_t * left, sample_t * right, sample_t bias)
{
	asm volatile(
		"movlps %2, %%xmm7		\n\t"
		"shufps $0x00, %%xmm7, %%xmm7	\n\t"
		"movl $-1024, %%esi		\n\t"
		".balign 16\n\t"
		"1:				\n\t"
		"movaps 1024(%1, %%esi), %%xmm0	\n\t" 
		"addps %%xmm7, %%xmm0		\n\t" //common
		"movaps (%0, %%esi), %%xmm1	\n\t" 
		"movaps (%1, %%esi), %%xmm2	\n\t"
		"addps %%xmm0, %%xmm1		\n\t"
		"addps %%xmm0, %%xmm2		\n\t"
		"movaps %%xmm1, (%0, %%esi)	\n\t"
		"movaps %%xmm2, (%1, %%esi)	\n\t"
		"addl $16, %%esi		\n\t"
		" jnz 1b			\n\t"
	:: "r" (left+256), "r" (right+256), "m" (bias)
	: "%esi"
	);
}

static void mix21toS_SSE (sample_t * samples, sample_t bias)
{
	asm volatile(
		"movlps %1, %%xmm7		\n\t"
		"shufps $0x00, %%xmm7, %%xmm7	\n\t"
		"movl $-1024, %%esi		\n\t"
		".balign 16\n\t"
		"1:				\n\t"
		"movaps 2048(%0, %%esi), %%xmm0	\n\t"  // surround
		"movaps (%0, %%esi), %%xmm1	\n\t" 
		"movaps 1024(%0, %%esi), %%xmm2	\n\t"
		"addps %%xmm7, %%xmm1		\n\t"
		"addps %%xmm7, %%xmm2		\n\t"
		"subps %%xmm0, %%xmm1		\n\t"
		"addps %%xmm0, %%xmm2		\n\t"
		"movaps %%xmm1, (%0, %%esi)	\n\t"
		"movaps %%xmm2, 1024(%0, %%esi)	\n\t"
		"addl $16, %%esi		\n\t"
		" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

static void mix31to2_SSE (sample_t * samples, sample_t bias)
{
	asm volatile(
		"movlps %1, %%xmm7		\n\t"
		"shufps $0x00, %%xmm7, %%xmm7	\n\t"
		"movl $-1024, %%esi		\n\t"
		".balign 16\n\t"
		"1:				\n\t"
		"movaps 1024(%0, %%esi), %%xmm0	\n\t"  
		"addps 3072(%0, %%esi), %%xmm0	\n\t"  
		"addps %%xmm7, %%xmm0		\n\t" // common
		"movaps (%0, %%esi), %%xmm1	\n\t" 
		"movaps 2048(%0, %%esi), %%xmm2	\n\t"
		"addps %%xmm0, %%xmm1		\n\t"
		"addps %%xmm0, %%xmm2		\n\t"
		"movaps %%xmm1, (%0, %%esi)	\n\t"
		"movaps %%xmm2, 1024(%0, %%esi)	\n\t"
		"addl $16, %%esi		\n\t"
		" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

static void mix31toS_SSE (sample_t * samples, sample_t bias)
{
	asm volatile(
		"movlps %1, %%xmm7		\n\t"
		"shufps $0x00, %%xmm7, %%xmm7	\n\t"
		"movl $-1024, %%esi		\n\t"
		".balign 16\n\t"
		"1:				\n\t"
		"movaps 1024(%0, %%esi), %%xmm0	\n\t"  
		"movaps 3072(%0, %%esi), %%xmm3	\n\t" // surround
		"addps %%xmm7, %%xmm0		\n\t" // common
		"movaps (%0, %%esi), %%xmm1	\n\t" 
		"movaps 2048(%0, %%esi), %%xmm2	\n\t"
		"addps %%xmm0, %%xmm1		\n\t"
		"addps %%xmm0, %%xmm2		\n\t"
		"subps %%xmm3, %%xmm1		\n\t"
		"addps %%xmm3, %%xmm2		\n\t"
		"movaps %%xmm1, (%0, %%esi)	\n\t"
		"movaps %%xmm2, 1024(%0, %%esi)	\n\t"
		"addl $16, %%esi		\n\t"
		" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

static void mix22toS_SSE (sample_t * samples, sample_t bias)
{
	asm volatile(
		"movlps %1, %%xmm7		\n\t"
		"shufps $0x00, %%xmm7, %%xmm7	\n\t"
		"movl $-1024, %%esi		\n\t"
		".balign 16\n\t"
		"1:				\n\t"
		"movaps 2048(%0, %%esi), %%xmm0	\n\t"  
		"addps 3072(%0, %%esi), %%xmm0	\n\t" // surround
		"movaps (%0, %%esi), %%xmm1	\n\t" 
		"movaps 1024(%0, %%esi), %%xmm2	\n\t"
		"addps %%xmm7, %%xmm1		\n\t"
		"addps %%xmm7, %%xmm2		\n\t"
		"subps %%xmm0, %%xmm1		\n\t"
		"addps %%xmm0, %%xmm2		\n\t"
		"movaps %%xmm1, (%0, %%esi)	\n\t"
		"movaps %%xmm2, 1024(%0, %%esi)	\n\t"
		"addl $16, %%esi		\n\t"
		" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

static void mix32to2_SSE (sample_t * samples, sample_t bias)
{
	asm volatile(
	"movlps %1, %%xmm7		\n\t"
	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
	"movl $-1024, %%esi		\n\t"
	".balign 16\n\t"
	"1:				\n\t"
	"movaps 1024(%0, %%esi), %%xmm0	\n\t" 
	"addps %%xmm7, %%xmm0		\n\t" // common
	"movaps %%xmm0, %%xmm1		\n\t" // common
	"addps (%0, %%esi), %%xmm0	\n\t" 
	"addps 2048(%0, %%esi), %%xmm1	\n\t" 
	"addps 3072(%0, %%esi), %%xmm0	\n\t" 
	"addps 4096(%0, %%esi), %%xmm1	\n\t" 
	"movaps %%xmm0, (%0, %%esi)	\n\t"
	"movaps %%xmm1, 1024(%0, %%esi)	\n\t"
	"addl $16, %%esi		\n\t"
	" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

static void mix32toS_SSE (sample_t * samples, sample_t bias)
{
	asm volatile(
	"movlps %1, %%xmm7		\n\t"
	"shufps $0x00, %%xmm7, %%xmm7	\n\t"
	"movl $-1024, %%esi		\n\t"
	".balign 16\n\t"
	"1:				\n\t"
	"movaps 1024(%0, %%esi), %%xmm0	\n\t" 
	"movaps 3072(%0, %%esi), %%xmm2	\n\t" 
	"addps %%xmm7, %%xmm0		\n\t" // common
	"addps 4096(%0, %%esi), %%xmm2	\n\t" // surround	
	"movaps (%0, %%esi), %%xmm1	\n\t" 
	"movaps 2048(%0, %%esi), %%xmm3	\n\t" 
	"subps %%xmm2, %%xmm1		\n\t"	
	"addps %%xmm2, %%xmm3		\n\t"	
	"addps %%xmm0, %%xmm1		\n\t"	
	"addps %%xmm0, %%xmm3		\n\t"	
	"movaps %%xmm1, (%0, %%esi)	\n\t"
	"movaps %%xmm3, 1024(%0, %%esi)	\n\t"
	"addl $16, %%esi		\n\t"
	" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

static void move2to1_SSE (sample_t * src, sample_t * dest, sample_t bias)
{
	asm volatile(
		"movlps %2, %%xmm7		\n\t"
		"shufps $0x00, %%xmm7, %%xmm7	\n\t"
		"movl $-1024, %%esi		\n\t"
		".balign 16\n\t"
		"1:				\n\t"
		"movaps (%0, %%esi), %%xmm0	\n\t"  
		"movaps 16(%0, %%esi), %%xmm1	\n\t"  
		"addps 1024(%0, %%esi), %%xmm0	\n\t"
		"addps 1040(%0, %%esi), %%xmm1	\n\t"
		"addps %%xmm7, %%xmm0		\n\t"
		"addps %%xmm7, %%xmm1		\n\t"
		"movaps %%xmm0, (%1, %%esi)	\n\t"
		"movaps %%xmm1, 16(%1, %%esi)	\n\t"
		"addl $32, %%esi		\n\t"
		" jnz 1b			\n\t"
	:: "r" (src+256), "r" (dest+256), "m" (bias)
	: "%esi"
	);
}

static void zero_MMX(sample_t * samples)
{
	asm volatile(
		"movl $-1024, %%esi		\n\t"
		"pxor %%mm0, %%mm0		\n\t"
		".balign 16\n\t"
		"1:				\n\t"
		"movq %%mm0, (%0, %%esi)	\n\t"
		"movq %%mm0, 8(%0, %%esi)	\n\t"
		"movq %%mm0, 16(%0, %%esi)	\n\t"
		"movq %%mm0, 24(%0, %%esi)	\n\t"
		"addl $32, %%esi		\n\t"
		" jnz 1b			\n\t"
		"emms"
	:: "r" (samples+256)
	: "%esi"
	);
}

/*
 I hope dest and src will be at least 8 byte aligned and size
 will devide on 8 without remain
 Note: untested and unused.
*/
static void copy_MMX(void *dest,const void *src,unsigned size)
{
  unsigned i;
  size /= 64;
	for(i=0;i<size;i++)
	{
	    __asm __volatile(
		"movq	%0,   %%mm0\n\t"
		"movq	8%0,  %%mm1\n\t"
		"movq	16%0, %%mm2\n\t"
		"movq	24%0, %%mm3\n\t"
		"movq	32%0, %%mm4\n\t"
		"movq	40%0, %%mm5\n\t"
		"movq	48%0, %%mm6\n\t"
		"movq	56%0, %%mm7\n\t"
		"movq	%%mm0, %1\n\t"
		"movq	%%mm1, 8%1\n\t"
		"movq	%%mm2, 16%1\n\t"
		"movq	%%mm3, 24%1\n\t"
		"movq	%%mm4, 32%1\n\t"
		"movq	%%mm5, 40%1\n\t"
		"movq	%%mm6, 48%1\n\t"
		"movq	%%mm7, 56%1\n\t"
		:
		:"m"(src),"m"(dest));
	}
}

static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias,
	      sample_t clev, sample_t slev)
{
    switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {

    case CONVERT (A52_CHANNEL, A52_CHANNEL2):
	memcpy (samples, samples + 256, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_CHANNEL, A52_MONO):
    case CONVERT (A52_STEREO, A52_MONO):
    mix_2to1_SSE:
	mix2to1_SSE (samples, samples + 256, bias);
	break;

    case CONVERT (A52_2F1R, A52_MONO):
	if (slev == 0)
	    goto mix_2to1_SSE;
    case CONVERT (A52_3F, A52_MONO):
    mix_3to1_SSE:
	mix3to1_SSE (samples, bias);
	break;

    case CONVERT (A52_3F1R, A52_MONO):
	if (slev == 0)
	    goto mix_3to1_SSE;
    case CONVERT (A52_2F2R, A52_MONO):
	if (slev == 0)
	    goto mix_2to1_SSE;
	mix4to1_SSE (samples, bias);
	break;

    case CONVERT (A52_3F2R, A52_MONO):
	if (slev == 0)
	    goto mix_3to1_SSE;
	mix5to1_SSE (samples, bias);
	break;

    case CONVERT (A52_MONO, A52_DOLBY):
	memcpy (samples + 256, samples, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_3F, A52_STEREO):
    case CONVERT (A52_3F, A52_DOLBY):
    mix_3to2_SSE:
	mix3to2_SSE (samples, bias);
	break;

    case CONVERT (A52_2F1R, A52_STEREO):
	if (slev == 0)
	    break;
	mix21to2_SSE (samples, samples + 256, bias);
	break;

    case CONVERT (A52_2F1R, A52_DOLBY):
	mix21toS_SSE (samples, bias);
	break;

    case CONVERT (A52_3F1R, A52_STEREO):
	if (slev == 0)
	    goto mix_3to2_SSE;
	mix31to2_SSE (samples, bias);
	break;

    case CONVERT (A52_3F1R, A52_DOLBY):
	mix31toS_SSE (samples, bias);
	break;

    case CONVERT (A52_2F2R, A52_STEREO):
	if (slev == 0)
	    break;
	mix2to1_SSE (samples, samples + 512, bias);
	mix2to1_SSE (samples + 256, samples + 768, bias);
	break;

    case CONVERT (A52_2F2R, A52_DOLBY):
	mix22toS_SSE (samples, bias);
	break;

    case CONVERT (A52_3F2R, A52_STEREO):
	if (slev == 0)
	    goto mix_3to2_SSE;
	mix32to2_SSE (samples, bias);
	break;

    case CONVERT (A52_3F2R, A52_DOLBY):
	mix32toS_SSE (samples, bias);
	break;

    case CONVERT (A52_3F1R, A52_3F):
	if (slev == 0)
	    break;
	mix21to2_SSE (samples, samples + 512, bias);
	break;

    case CONVERT (A52_3F2R, A52_3F):
	if (slev == 0)
	    break;
	mix2to1_SSE (samples, samples + 768, bias);
	mix2to1_SSE (samples + 512, samples + 1024, bias);
	break;

    case CONVERT (A52_3F1R, A52_2F1R):
	mix3to2_SSE (samples, bias);
	memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_2F2R, A52_2F1R):
	mix2to1_SSE (samples + 512, samples + 768, bias);
	break;

    case CONVERT (A52_3F2R, A52_2F1R):
	mix3to2_SSE (samples, bias); //FIXME possible bug? (output doesnt seem to be used)
	move2to1_SSE (samples + 768, samples + 512, bias);
	break;

    case CONVERT (A52_3F2R, A52_3F1R):
	mix2to1_SSE (samples + 768, samples + 1024, bias);
	break;

    case CONVERT (A52_2F1R, A52_2F2R):
	memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_3F1R, A52_2F2R):
	mix3to2_SSE (samples, bias);
	memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_3F2R, A52_2F2R):
	mix3to2_SSE (samples, bias);
	memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
	memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_3F1R, A52_3F2R):
	memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
	break;
    }
}

static void upmix_MMX (sample_t * samples, int acmod, int output)
{
    switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {

    case CONVERT (A52_CHANNEL, A52_CHANNEL2):
	memcpy (samples + 256, samples, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_3F2R, A52_MONO):
	zero_MMX (samples + 1024);
    case CONVERT (A52_3F1R, A52_MONO):
    case CONVERT (A52_2F2R, A52_MONO):
	zero_MMX (samples + 768);
    case CONVERT (A52_3F, A52_MONO):
    case CONVERT (A52_2F1R, A52_MONO):
	zero_MMX (samples + 512);
    case CONVERT (A52_CHANNEL, A52_MONO):
    case CONVERT (A52_STEREO, A52_MONO):
	zero_MMX (samples + 256);
	break;

    case CONVERT (A52_3F2R, A52_STEREO):
    case CONVERT (A52_3F2R, A52_DOLBY):
	zero_MMX (samples + 1024);
    case CONVERT (A52_3F1R, A52_STEREO):
    case CONVERT (A52_3F1R, A52_DOLBY):
	zero_MMX (samples + 768);
    case CONVERT (A52_3F, A52_STEREO):
    case CONVERT (A52_3F, A52_DOLBY):
    mix_3to2_MMX:
	memcpy (samples + 512, samples + 256, 256 * sizeof (sample_t));
	zero_MMX (samples + 256);
	break;

    case CONVERT (A52_2F2R, A52_STEREO):
    case CONVERT (A52_2F2R, A52_DOLBY):
	zero_MMX (samples + 768);
    case CONVERT (A52_2F1R, A52_STEREO):
    case CONVERT (A52_2F1R, A52_DOLBY):
	zero_MMX (samples + 512);
	break;

    case CONVERT (A52_3F2R, A52_3F):
	zero_MMX (samples + 1024);
    case CONVERT (A52_3F1R, A52_3F):
    case CONVERT (A52_2F2R, A52_2F1R):
	zero_MMX (samples + 768);
	break;

    case CONVERT (A52_3F2R, A52_3F1R):
	zero_MMX (samples + 1024);
	break;

    case CONVERT (A52_3F2R, A52_2F1R):
	zero_MMX (samples + 1024);
    case CONVERT (A52_3F1R, A52_2F1R):
    mix_31to21_MMX:
	memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t));
	goto mix_3to2_MMX;

    case CONVERT (A52_3F2R, A52_2F2R):
	memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
	goto mix_31to21_MMX;
    }
}

static void mix2to1_3dnow (sample_t * dest, sample_t * src, sample_t bias)
{
	asm volatile(
	"movd  %2, %%mm7	\n\t"
	"punpckldq %2, %%mm7	\n\t"
	"movl  $-1024, %%esi	\n\t"
	".balign 16\n\t"
	"1:			\n\t"
	"movq  (%0, %%esi), %%mm0	\n\t" 
	"movq  8(%0, %%esi), %%mm1	\n\t"
	"movq  16(%0, %%esi), %%mm2	\n\t" 
	"movq  24(%0, %%esi), %%mm3	\n\t"
	"pfadd (%1, %%esi), %%mm0	\n\t" 
	"pfadd 8(%1, %%esi), %%mm1	\n\t"
	"pfadd 16(%1, %%esi), %%mm2	\n\t" 
	"pfadd 24(%1, %%esi), %%mm3	\n\t"
	"pfadd %%mm7, %%mm0		\n\t"
	"pfadd %%mm7, %%mm1		\n\t"
	"pfadd %%mm7, %%mm2		\n\t"
	"pfadd %%mm7, %%mm3		\n\t"
	"movq  %%mm0, (%1, %%esi)	\n\t"
	"movq  %%mm1, 8(%1, %%esi)	\n\t"
	"movq  %%mm2, 16(%1, %%esi)	\n\t"
	"movq  %%mm3, 24(%1, %%esi)	\n\t"
	"addl $32, %%esi		\n\t"
	" jnz 1b			\n\t"
	:: "r" (src+256), "r" (dest+256), "m" (bias)
	: "%esi"
	);
}

static void mix3to1_3dnow (sample_t * samples, sample_t bias)
{
	asm volatile(
	"movd  %1, %%mm7	\n\t"
	"punpckldq %1, %%mm7	\n\t"
	"movl $-1024, %%esi	\n\t"
	".balign 16\n\t"
	"1:			\n\t"
	"movq  (%0, %%esi), %%mm0	\n\t" 
	"movq  8(%0, %%esi), %%mm1	\n\t"
	"movq  1024(%0, %%esi), %%mm2	\n\t" 
	"movq  1032(%0, %%esi), %%mm3	\n\t"
	"pfadd 2048(%0, %%esi), %%mm0	\n\t" 
	"pfadd 2056(%0, %%esi), %%mm1	\n\t"
	"pfadd %%mm7, %%mm0		\n\t"
	"pfadd %%mm7, %%mm1		\n\t"
	"pfadd %%mm2, %%mm0		\n\t"
	"pfadd %%mm3, %%mm1		\n\t"
	"movq  %%mm0, (%0, %%esi)	\n\t"
	"movq  %%mm1, 8(%0, %%esi)	\n\t"
	"addl $16, %%esi		\n\t"
	" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

static void mix4to1_3dnow (sample_t * samples, sample_t bias)
{
	asm volatile(
	"movd  %1, %%mm7	\n\t"
	"punpckldq %1, %%mm7	\n\t"
	"movl $-1024, %%esi	\n\t"
	".balign 16\n\t"
	"1:			\n\t"
	"movq  (%0, %%esi), %%mm0	\n\t" 
	"movq  8(%0, %%esi), %%mm1	\n\t"
	"movq  1024(%0, %%esi), %%mm2	\n\t" 
	"movq  1032(%0, %%esi), %%mm3	\n\t"
	"pfadd 2048(%0, %%esi), %%mm0	\n\t" 
	"pfadd 2056(%0, %%esi), %%mm1	\n\t"
	"pfadd 3072(%0, %%esi), %%mm2	\n\t" 
	"pfadd 3080(%0, %%esi), %%mm3	\n\t"
	"pfadd %%mm7, %%mm0		\n\t"
	"pfadd %%mm7, %%mm1		\n\t"
	"pfadd %%mm2, %%mm0		\n\t"
	"pfadd %%mm3, %%mm1		\n\t"
	"movq  %%mm0, (%0, %%esi)	\n\t"
	"movq  %%mm1, 8(%0, %%esi)	\n\t"
	"addl $16, %%esi		\n\t"
	" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

static void mix5to1_3dnow (sample_t * samples, sample_t bias)
{
	asm volatile(
	"movd  %1, %%mm7	\n\t"
	"punpckldq %1, %%mm7	\n\t"
	"movl $-1024, %%esi	\n\t"
	".balign 16\n\t"
	"1:			\n\t"
	"movq  (%0, %%esi), %%mm0	\n\t" 
	"movq  8(%0, %%esi), %%mm1	\n\t"
	"movq  1024(%0, %%esi), %%mm2	\n\t" 
	"movq  1032(%0, %%esi), %%mm3	\n\t"
	"pfadd 2048(%0, %%esi), %%mm0	\n\t" 
	"pfadd 2056(%0, %%esi), %%mm1	\n\t"
	"pfadd 3072(%0, %%esi), %%mm2	\n\t" 
	"pfadd 3080(%0, %%esi), %%mm3	\n\t"
	"pfadd %%mm7, %%mm0		\n\t"
	"pfadd %%mm7, %%mm1		\n\t"
	"pfadd 4096(%0, %%esi), %%mm2	\n\t" 
	"pfadd 4104(%0, %%esi), %%mm3	\n\t"
	"pfadd %%mm2, %%mm0		\n\t"
	"pfadd %%mm3, %%mm1		\n\t"
	"movq  %%mm0, (%0, %%esi)	\n\t"
	"movq  %%mm1, 8(%0, %%esi)	\n\t"
	"addl $16, %%esi		\n\t"
	" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

static void mix3to2_3dnow (sample_t * samples, sample_t bias)
{
	asm volatile(
	"movd  %1, %%mm7	\n\t"
	"punpckldq %1, %%mm7	\n\t"
	"movl $-1024, %%esi	\n\t"
	".balign 16\n\t"
	"1:			\n\t"
	"movq   1024(%0, %%esi), %%mm0	\n\t" 
	"movq   1032(%0, %%esi), %%mm1	\n\t"
	"pfadd  %%mm7, %%mm0		\n\t" //common
	"pfadd  %%mm7, %%mm1		\n\t" //common
	"movq   (%0, %%esi), %%mm2	\n\t" 
	"movq   8(%0, %%esi), %%mm3	\n\t"
	"movq   2048(%0, %%esi), %%mm4	\n\t"
	"movq   2056(%0, %%esi), %%mm5	\n\t"
	"pfadd  %%mm0, %%mm2		\n\t"
	"pfadd  %%mm1, %%mm3		\n\t"
	"pfadd  %%mm0, %%mm4		\n\t"
	"pfadd  %%mm1, %%mm5		\n\t"
	"movq   %%mm2, (%0, %%esi)	\n\t"
	"movq   %%mm3, 8(%0, %%esi)	\n\t"
	"movq   %%mm4, 1024(%0, %%esi)	\n\t"
	"movq   %%mm5, 1032(%0, %%esi)	\n\t"
	"addl $16, %%esi		\n\t"
	" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

static void mix21to2_3dnow (sample_t * left, sample_t * right, sample_t bias)
{
	asm volatile(
		"movd  %2, %%mm7	\n\t"
		"punpckldq %2, %%mm7	\n\t"
		"movl $-1024, %%esi	\n\t"
		".balign 16\n\t"
		"1:			\n\t"
		"movq  1024(%1, %%esi), %%mm0	\n\t" 
		"movq  1032(%1, %%esi), %%mm1	\n\t"
		"pfadd %%mm7, %%mm0		\n\t" //common
		"pfadd %%mm7, %%mm1		\n\t" //common
		"movq  (%0, %%esi), %%mm2	\n\t" 
		"movq  8(%0, %%esi), %%mm3	\n\t"
		"movq  (%1, %%esi), %%mm4	\n\t"
		"movq  8(%1, %%esi), %%mm5	\n\t"
		"pfadd %%mm0, %%mm2		\n\t"
		"pfadd %%mm1, %%mm3		\n\t"
		"pfadd %%mm0, %%mm4		\n\t"
		"pfadd %%mm1, %%mm5		\n\t"
		"movq  %%mm2, (%0, %%esi)	\n\t"
		"movq  %%mm3, 8(%0, %%esi)	\n\t"
		"movq  %%mm4, (%1, %%esi)	\n\t"
		"movq  %%mm5, 8(%1, %%esi)	\n\t"
		"addl $16, %%esi		\n\t"
		" jnz 1b			\n\t"
	:: "r" (left+256), "r" (right+256), "m" (bias)
	: "%esi"
	);
}

static void mix21toS_3dnow (sample_t * samples, sample_t bias)
{
	asm volatile(
		"movd  %1, %%mm7	\n\t"
		"punpckldq %1, %%mm7	\n\t"
		"movl $-1024, %%esi	\n\t"
		".balign 16\n\t"
		"1:			\n\t"
		"movq  2048(%0, %%esi), %%mm0	\n\t"  // surround
		"movq  2056(%0, %%esi), %%mm1	\n\t"  // surround
		"movq  (%0, %%esi), %%mm2	\n\t" 
		"movq  8(%0, %%esi), %%mm3	\n\t"
		"movq  1024(%0, %%esi), %%mm4	\n\t"
		"movq  1032(%0, %%esi), %%mm5	\n\t"
		"pfadd %%mm7, %%mm2		\n\t"
		"pfadd %%mm7, %%mm3		\n\t"
		"pfadd %%mm7, %%mm4		\n\t"
		"pfadd %%mm7, %%mm5		\n\t"
		"pfsub %%mm0, %%mm2		\n\t"
		"pfsub %%mm1, %%mm3		\n\t"
		"pfadd %%mm0, %%mm4		\n\t"
		"pfadd %%mm1, %%mm5		\n\t"
		"movq  %%mm2, (%0, %%esi)	\n\t"
		"movq  %%mm3, 8(%0, %%esi)	\n\t"
		"movq  %%mm4, 1024(%0, %%esi)	\n\t"
		"movq  %%mm5, 1032(%0, %%esi)	\n\t"
		"addl $16, %%esi		\n\t"
		" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

static void mix31to2_3dnow (sample_t * samples, sample_t bias)
{
	asm volatile(
		"movd  %1, %%mm7	\n\t"
		"punpckldq %1, %%mm7	\n\t"
		"movl $-1024, %%esi	\n\t"
		".balign 16\n\t"
		"1:			\n\t"
		"movq  1024(%0, %%esi), %%mm0	\n\t"  
		"movq  1032(%0, %%esi), %%mm1	\n\t"
		"pfadd 3072(%0, %%esi), %%mm0	\n\t"  
		"pfadd 3080(%0, %%esi), %%mm1	\n\t"
		"pfadd %%mm7, %%mm0		\n\t" // common
		"pfadd %%mm7, %%mm1		\n\t" // common
		"movq  (%0, %%esi), %%mm2	\n\t" 
		"movq  8(%0, %%esi), %%mm3	\n\t"
		"movq  2048(%0, %%esi), %%mm4	\n\t"
		"movq  2056(%0, %%esi), %%mm5	\n\t"
		"pfadd %%mm0, %%mm2		\n\t"
		"pfadd %%mm1, %%mm3		\n\t"
		"pfadd %%mm0, %%mm4		\n\t"
		"pfadd %%mm1, %%mm5		\n\t"
		"movq  %%mm2, (%0, %%esi)	\n\t"
		"movq  %%mm3, 8(%0, %%esi)	\n\t"
		"movq  %%mm4, 1024(%0, %%esi)	\n\t"
		"movq  %%mm5, 1032(%0, %%esi)	\n\t"
		"addl $16, %%esi		\n\t"
		" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

static void mix31toS_3dnow (sample_t * samples, sample_t bias)
{
	asm volatile(
		"movd  %1, %%mm7	\n\t"
		"punpckldq %1, %%mm7	\n\t"
		"movl $-1024, %%esi	\n\t"
		".balign 16\n\t"
		"1:			\n\t"
		"movq   1024(%0, %%esi), %%mm0	\n\t"  
		"movq   1032(%0, %%esi), %%mm1	\n\t"
		"pfadd  %%mm7, %%mm0		\n\t" // common
		"pfadd  %%mm7, %%mm1		\n\t" // common
		"movq   (%0, %%esi), %%mm2	\n\t" 
		"movq   8(%0, %%esi), %%mm3	\n\t"
		"movq   2048(%0, %%esi), %%mm4	\n\t"
		"movq   2056(%0, %%esi), %%mm5	\n\t"
		"pfadd  %%mm0, %%mm2		\n\t"
		"pfadd  %%mm1, %%mm3		\n\t"
		"pfadd  %%mm0, %%mm4		\n\t"
		"pfadd  %%mm1, %%mm5		\n\t"
		"movq   3072(%0, %%esi), %%mm0	\n\t" // surround
		"movq   3080(%0, %%esi), %%mm1	\n\t" // surround
		"pfsub  %%mm0, %%mm2		\n\t"
		"pfsub  %%mm1, %%mm3		\n\t"
		"pfadd  %%mm0, %%mm4		\n\t"
		"pfadd  %%mm1, %%mm5		\n\t"
		"movq   %%mm2, (%0, %%esi)	\n\t"
		"movq   %%mm3, 8(%0, %%esi)	\n\t"
		"movq   %%mm4, 1024(%0, %%esi)	\n\t"
		"movq   %%mm5, 1032(%0, %%esi)	\n\t"
		"addl $16, %%esi		\n\t"
		" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

static void mix22toS_3dnow (sample_t * samples, sample_t bias)
{
	asm volatile(
		"movd  %1, %%mm7	\n\t"
		"punpckldq %1, %%mm7	\n\t"
		"movl $-1024, %%esi	\n\t"
		".balign 16\n\t"
		"1:			\n\t"
		"movq  2048(%0, %%esi), %%mm0	\n\t"  
		"movq  2056(%0, %%esi), %%mm1	\n\t"
		"pfadd 3072(%0, %%esi), %%mm0	\n\t" // surround
		"pfadd 3080(%0, %%esi), %%mm1	\n\t" // surround
		"movq  (%0, %%esi), %%mm2	\n\t" 
		"movq  8(%0, %%esi), %%mm3	\n\t"
		"movq  1024(%0, %%esi), %%mm4	\n\t"
		"movq  1032(%0, %%esi), %%mm5	\n\t"
		"pfadd %%mm7, %%mm2		\n\t"
		"pfadd %%mm7, %%mm3		\n\t"
		"pfadd %%mm7, %%mm4		\n\t"
		"pfadd %%mm7, %%mm5		\n\t"
		"pfsub %%mm0, %%mm2		\n\t"
		"pfsub %%mm1, %%mm3		\n\t"
		"pfadd %%mm0, %%mm4		\n\t"
		"pfadd %%mm1, %%mm5		\n\t"
		"movq  %%mm2, (%0, %%esi)	\n\t"
		"movq  %%mm3, 8(%0, %%esi)	\n\t"
		"movq  %%mm4, 1024(%0, %%esi)	\n\t"
		"movq  %%mm5, 1032(%0, %%esi)	\n\t"
		"addl $16, %%esi		\n\t"
		" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

static void mix32to2_3dnow (sample_t * samples, sample_t bias)
{
	asm volatile(
	"movd  %1, %%mm7	\n\t"
	"punpckldq %1, %%mm7	\n\t"
	"movl $-1024, %%esi	\n\t"
	".balign 16\n\t"
	"1:			\n\t"
	"movq   1024(%0, %%esi), %%mm0	\n\t" 
	"movq   1032(%0, %%esi), %%mm1	\n\t"
	"pfadd  %%mm7, %%mm0		\n\t" // common
	"pfadd  %%mm7, %%mm1		\n\t" // common
	"movq   %%mm0, %%mm2		\n\t" // common
	"movq   %%mm1, %%mm3		\n\t" // common
	"pfadd  (%0, %%esi), %%mm0	\n\t" 
	"pfadd  8(%0, %%esi), %%mm1	\n\t"
	"pfadd  2048(%0, %%esi), %%mm2	\n\t" 
	"pfadd  2056(%0, %%esi), %%mm3	\n\t"
	"pfadd  3072(%0, %%esi), %%mm0	\n\t" 
	"pfadd  3080(%0, %%esi), %%mm1	\n\t"
	"pfadd  4096(%0, %%esi), %%mm2	\n\t" 
	"pfadd  4104(%0, %%esi), %%mm3	\n\t"
	"movq   %%mm0, (%0, %%esi)	\n\t"
	"movq   %%mm1, 8(%0, %%esi)	\n\t"
	"movq   %%mm2, 1024(%0, %%esi)	\n\t"
	"movq   %%mm3, 1032(%0, %%esi)	\n\t"
	"addl $16, %%esi		\n\t"
	" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

/* todo: should be optimized better */
static void mix32toS_3dnow (sample_t * samples, sample_t bias)
{
	asm volatile(
	"movl $-1024, %%esi	\n\t"
	".balign 16\n\t"
	"1:			\n\t"
	"movd  %1, %%mm7		\n\t"
	"punpckldq %1, %%mm7		\n\t"
	"movq  1024(%0, %%esi), %%mm0	\n\t" 
	"movq  1032(%0, %%esi), %%mm1	\n\t"
	"movq  3072(%0, %%esi), %%mm4	\n\t" 
	"movq  3080(%0, %%esi), %%mm5	\n\t"
	"pfadd %%mm7, %%mm0		\n\t" // common
	"pfadd %%mm7, %%mm1		\n\t" // common
	"pfadd 4096(%0, %%esi), %%mm4	\n\t" // surround	
	"pfadd 4104(%0, %%esi), %%mm5	\n\t" // surround
	"movq  (%0, %%esi), %%mm2	\n\t" 
	"movq  8(%0, %%esi), %%mm3	\n\t"
	"movq  2048(%0, %%esi), %%mm6	\n\t" 
	"movq  2056(%0, %%esi), %%mm7	\n\t"
	"pfsub %%mm4, %%mm2		\n\t"	
	"pfsub %%mm5, %%mm3		\n\t"
	"pfadd %%mm4, %%mm6		\n\t"	
	"pfadd %%mm5, %%mm7		\n\t"
	"pfadd %%mm0, %%mm2		\n\t"	
	"pfadd %%mm1, %%mm3		\n\t"
	"pfadd %%mm0, %%mm6		\n\t"	
	"pfadd %%mm1, %%mm7		\n\t"
	"movq  %%mm2, (%0, %%esi)	\n\t"
	"movq  %%mm3, 8(%0, %%esi)	\n\t"
	"movq  %%mm6, 1024(%0, %%esi)	\n\t"
	"movq  %%mm7, 1032(%0, %%esi)	\n\t"
	"addl $16, %%esi		\n\t"
	" jnz 1b			\n\t"
	:: "r" (samples+256), "m" (bias)
	: "%esi"
	);
}

static void move2to1_3dnow (sample_t * src, sample_t * dest, sample_t bias)
{
	asm volatile(
		"movd  %2, %%mm7	\n\t"
		"punpckldq %2, %%mm7	\n\t"
		"movl $-1024, %%esi	\n\t"
		".balign 16\n\t"
		"1:			\n\t"
		"movq  (%0, %%esi), %%mm0	\n\t"  
		"movq  8(%0, %%esi), %%mm1	\n\t"
		"movq  16(%0, %%esi), %%mm2	\n\t"  
		"movq  24(%0, %%esi), %%mm3	\n\t"
		"pfadd 1024(%0, %%esi), %%mm0	\n\t"
		"pfadd 1032(%0, %%esi), %%mm1	\n\t"
		"pfadd 1040(%0, %%esi), %%mm2	\n\t"
		"pfadd 1048(%0, %%esi), %%mm3	\n\t"
		"pfadd %%mm7, %%mm0		\n\t"
		"pfadd %%mm7, %%mm1		\n\t"
		"pfadd %%mm7, %%mm2		\n\t"
		"pfadd %%mm7, %%mm3		\n\t"
		"movq  %%mm0, (%1, %%esi)	\n\t"
		"movq  %%mm1, 8(%1, %%esi)	\n\t"
		"movq  %%mm2, 16(%1, %%esi)	\n\t"
		"movq  %%mm3, 24(%1, %%esi)	\n\t"
		"addl $32, %%esi		\n\t"
		" jnz 1b			\n\t"
	:: "r" (src+256), "r" (dest+256), "m" (bias)
	: "%esi"
	);
}

static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias,
	      sample_t clev, sample_t slev)
{
    switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {

    case CONVERT (A52_CHANNEL, A52_CHANNEL2):
	memcpy (samples, samples + 256, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_CHANNEL, A52_MONO):
    case CONVERT (A52_STEREO, A52_MONO):
    mix_2to1_3dnow:
	mix2to1_3dnow (samples, samples + 256, bias);
	break;

    case CONVERT (A52_2F1R, A52_MONO):
	if (slev == 0)
	    goto mix_2to1_3dnow;
    case CONVERT (A52_3F, A52_MONO):
    mix_3to1_3dnow:
	mix3to1_3dnow (samples, bias);
	break;

    case CONVERT (A52_3F1R, A52_MONO):
	if (slev == 0)
	    goto mix_3to1_3dnow;
    case CONVERT (A52_2F2R, A52_MONO):
	if (slev == 0)
	    goto mix_2to1_3dnow;
	mix4to1_3dnow (samples, bias);
	break;

    case CONVERT (A52_3F2R, A52_MONO):
	if (slev == 0)
	    goto mix_3to1_3dnow;
	mix5to1_3dnow (samples, bias);
	break;

    case CONVERT (A52_MONO, A52_DOLBY):
	memcpy (samples + 256, samples, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_3F, A52_STEREO):
    case CONVERT (A52_3F, A52_DOLBY):
    mix_3to2_3dnow:
	mix3to2_3dnow (samples, bias);
	break;

    case CONVERT (A52_2F1R, A52_STEREO):
	if (slev == 0)
	    break;
	mix21to2_3dnow (samples, samples + 256, bias);
	break;

    case CONVERT (A52_2F1R, A52_DOLBY):
	mix21toS_3dnow (samples, bias);
	break;

    case CONVERT (A52_3F1R, A52_STEREO):
	if (slev == 0)
	    goto mix_3to2_3dnow;
	mix31to2_3dnow (samples, bias);
	break;

    case CONVERT (A52_3F1R, A52_DOLBY):
	mix31toS_3dnow (samples, bias);
	break;

    case CONVERT (A52_2F2R, A52_STEREO):
	if (slev == 0)
	    break;
	mix2to1_3dnow (samples, samples + 512, bias);
	mix2to1_3dnow (samples + 256, samples + 768, bias);
	break;

    case CONVERT (A52_2F2R, A52_DOLBY):
	mix22toS_3dnow (samples, bias);
	break;

    case CONVERT (A52_3F2R, A52_STEREO):
	if (slev == 0)
	    goto mix_3to2_3dnow;
	mix32to2_3dnow (samples, bias);
	break;

    case CONVERT (A52_3F2R, A52_DOLBY):
	mix32toS_3dnow (samples, bias);
	break;

    case CONVERT (A52_3F1R, A52_3F):
	if (slev == 0)
	    break;
	mix21to2_3dnow (samples, samples + 512, bias);
	break;

    case CONVERT (A52_3F2R, A52_3F):
	if (slev == 0)
	    break;
	mix2to1_3dnow (samples, samples + 768, bias);
	mix2to1_3dnow (samples + 512, samples + 1024, bias);
	break;

    case CONVERT (A52_3F1R, A52_2F1R):
	mix3to2_3dnow (samples, bias);
	memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_2F2R, A52_2F1R):
	mix2to1_3dnow (samples + 512, samples + 768, bias);
	break;

    case CONVERT (A52_3F2R, A52_2F1R):
	mix3to2_3dnow (samples, bias); //FIXME possible bug? (output doesnt seem to be used)
	move2to1_3dnow (samples + 768, samples + 512, bias);
	break;

    case CONVERT (A52_3F2R, A52_3F1R):
	mix2to1_3dnow (samples + 768, samples + 1024, bias);
	break;

    case CONVERT (A52_2F1R, A52_2F2R):
	memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_3F1R, A52_2F2R):
	mix3to2_3dnow (samples, bias);
	memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_3F2R, A52_2F2R):
	mix3to2_3dnow (samples, bias);
	memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
	memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t));
	break;

    case CONVERT (A52_3F1R, A52_3F2R):
	memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
	break;
    }
    __asm __volatile("femms":::"memory");
}

#endif //ARCH_X86