view libmpeg2/motion_comp_vis.c @ 34690:eeb74ce15120

Fix bug with playlist playback. When <stop> is pressed and you shift to another track and press <play>, the track is played twice (before advancing to the next element in the list). Reported by drake ch, drake.ch hotmail com. When shifting to another track, set information that at the end of playback of the current track the next track to be played shall not be determined (!uiGotoTheNext) only if currently playing. That is because the track shifting function itself (which already has determined the next track to be played) will end the playback of the current track (and start the next one) only if currently playing. If not playing, the next track to be played after the end of playback of the current track (determined by the track shifting function) must be determined again (uiGotoTheNext).
author ib
date Mon, 27 Feb 2012 22:09:34 +0000
parents 25337a2147e7
children
line wrap: on
line source

/*
 * motion_comp_vis.c
 * Copyright (C) 2003 David S. Miller <davem@redhat.com>
 *
 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
 * See http://libmpeg2.sourceforge.net/ for updates.
 *
 * mpeg2dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * mpeg2dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include "config.h"

#if ARCH_SPARC

#include <inttypes.h>

#include "mpeg2.h"
#include "attributes.h"
#include "mpeg2_internal.h"
#include "vis.h"

/* The trick used in some of this file is the formula from the MMX
 * motion comp code, which is:
 *
 * (x+y+1)>>1 == (x|y)-((x^y)>>1)
 *
 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
 * We avoid overflows by masking before we do the shift, and we
 * implement the shift by multiplying by 1/2 using mul8x16.  So in
 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
 * the value 0x80808080 is in f8):
 *
 *	fxor		f0, f2, f10
 *	fand		f10, f4, f10
 *	fmul8x16	f8, f10, f10
 *	fand		f10, f6, f10
 *	for		f0, f2, f12
 *	fpsub16		f12, f10, f10
 */

/* Initializer helpers: repeat one value across a 4- or 8-element table. */
#define DUP4(x) {x, x, x, x}
#define DUP8(x) {x, x, x, x, x, x, x, x}

/*
 * 8-byte aligned tables of four identical 16-bit lanes.  They are
 * fetched whole with vis_ld64() into the CONST_* register slots.
 */
static const int16_t constants1[] ATTR_ALIGN(8) = DUP4 (1);
static const int16_t constants2[] ATTR_ALIGN(8) = DUP4 (2);
static const int16_t constants3[] ATTR_ALIGN(8) = DUP4 (3);
static const int16_t constants6[] ATTR_ALIGN(8) = DUP4 (6);

/*
 * Byte-wise masks and the 0x80 multiplier used by the averaging trick
 * described in the formula comment above.  These are declared unsigned
 * because 0xfe and 128 are not representable in int8_t; the stored bit
 * patterns are what matters, since the tables are only ever loaded as
 * raw 64-bit words.
 */
static const uint8_t constants_fe[] ATTR_ALIGN(8) = DUP8 (0xfe);
static const uint8_t constants_7f[] ATTR_ALIGN(8) = DUP8 (0x7f);
static const uint8_t constants128[] ATTR_ALIGN(8) = DUP8 (128);

/*
 * Paired 16-bit multipliers.  The code loads each table into the single
 * register slot that CONST_256 / CONST_512 / CONST_1024 all name.
 */
static const int16_t constants256_512[] ATTR_ALIGN(8) =
	{256, 512, 256, 512};
static const int16_t constants256_1024[] ATTR_ALIGN(8) =
	{256, 1024, 256, 1024};

/*
 * Symbolic names for the register numbers passed to the vis_* macros
 * (presumably VIS floating-point register numbers, as in the f0/f2/...
 * example in the formula comment above -- confirm in vis.h).
 *
 * REF_n / REF_S_n hold realigned reference pixels; the "_1" names are
 * the odd-numbered slot right after the even one.
 */
#define REF_0		0
#define REF_0_1		1
#define REF_2		2
#define REF_2_1		3
#define REF_4		4
#define REF_4_1		5
#define REF_6		6
#define REF_6_1		7
#define REF_S0		8
#define REF_S0_1	9
#define REF_S2		10
#define REF_S2_1	11
#define REF_S4		12
#define REF_S4_1	13
#define REF_S6		14
#define REF_S6_1	15
/* Destination pixels loaded from / stored to dest. */
#define DST_0		16
#define DST_1		17
#define DST_2		18
#define DST_3		19
/*
 * CONST_1, CONST_2, CONST_3, CONST_6 and MASK_fe all name slot 20, so a
 * function can keep only one of them loaded at a time.
 */
#define CONST_1		20
#define CONST_2		20
#define CONST_3		20
#define CONST_6		20
#define MASK_fe		20
/* Likewise CONST_128 / CONST_256 / CONST_512 / CONST_1024 share slot 22. */
#define CONST_128	22
#define CONST_256	22
#define CONST_512	22
#define CONST_1024	22
/* Scratch slots; TMP0..TMP5 are consecutive numbers. */
#define TMP0		24
#define TMP1		25
#define TMP2		26
#define TMP3		27
#define TMP4		28
#define TMP5		29
/* ZERO and MASK_7f share slot 30: a function uses one or the other. */
#define ZERO		30
#define MASK_7f		30

/* Further scratch slots, even numbers only from here on. */
#define TMP6		32
#define TMP8		34
#define TMP10		36
#define TMP12		38
#define TMP14		40
#define TMP16		42
#define TMP18		44
#define TMP20		46
#define TMP22		48
#define TMP24		50
#define TMP26		52
#define TMP28		54
#define TMP30		56
#define TMP32		58

/*
 * Full-pel "put" for a 16-pixel-wide block: every reference row is
 * realigned and stored to dest unchanged, `height` rows in total.
 *
 * Each row is fetched as three 8-byte loads (TMP0, TMP2, TMP4) that are
 * combined pairwise with vis_faligndata.  When vis_alignaddr() hands the
 * pointer back unchanged, the third load re-reads offset 0 instead of
 * offset 16 (presumably so that an already aligned source never touches
 * bytes beyond the 16 it needs -- confirm against vis.h).
 *
 * The loop is bottom-tested, so callers must pass height >= 1.
 */
static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * _ref,
			     const int stride, int height)
{
	uint8_t *src = (uint8_t *) _ref;
	int third_off = 0;

	src = vis_alignaddr(src);
	if (src != _ref)
		third_off = 16;

	for (;;) {	/* 5 cycles */
		vis_ld64(src[0], TMP0);
		vis_ld64_2(src, 8, TMP2);
		vis_ld64_2(src, third_off, TMP4);

		/* left 8 output bytes */
		vis_faligndata(TMP0, TMP2, REF_0);
		vis_st64(REF_0, dest[0]);

		/* right 8 output bytes */
		vis_faligndata(TMP2, TMP4, REF_2);
		vis_st64_2(REF_2, dest, 8);

		src += stride;
		dest += stride;
		if (--height == 0)
			break;
	}
}

/*
 * Full-pel "put" for an 8-pixel-wide block: every reference row is
 * realigned and stored to dest unchanged, `height` rows in total.
 *
 * A row is built from two 8-byte loads joined by vis_faligndata.  When
 * vis_alignaddr() returns the pointer unchanged, the second load simply
 * re-reads offset 0 rather than offset 8.
 *
 * The loop is bottom-tested, so callers must pass height >= 1.
 */
static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * _ref,
			    const int stride, int height)
{
	uint8_t *src = (uint8_t *) _ref;
	int second_off = 0;

	src = vis_alignaddr(src);
	if (src != _ref)
		second_off = 8;

	for (;;) {	/* 4 cycles */
		vis_ld64(src[0], TMP0);
		vis_ld64_2(src, second_off, TMP2);

		/* stall */

		vis_faligndata(TMP0, TMP2, REF_0);
		vis_st64(REF_0, dest[0]);

		src += stride;
		dest += stride;
		if (--height == 0)
			break;
	}
}


/*
 * Full-pel "avg" for a 16-pixel-wide block: each dest byte is replaced
 * by the rounded-up average of itself and the realigned reference byte,
 * using the xor / and(0xfe) / mul8x16(0x80) / and(0x7f) / or / psub16
 * sequence described in the formula comment near the top of this file.
 *
 * Register use: REF_0/REF_2 hold the current reference row, DST_0/DST_2
 * the matching dest row; the following row of each is loaded while the
 * current one is still being combined.
 *
 * The loop is software-pipelined and handles two rows per pass; the
 * prologue primes row 0 and the epilogue emits the final two rows.
 * NOTE(review): this requires an even height >= 4.  With height == 2 the
 * counter becomes 0 before the bottom-tested loop and `--height` goes
 * negative instead of ending it.
 */
static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * _ref,
			     const int stride, int height)
{
	uint8_t *ref = (uint8_t *) _ref;
	int stride_8 = stride + 8;
	int offset;

	ref = vis_alignaddr(ref);
	/* Third load re-reads offset 0 when the pointer was left unchanged. */
	offset = (ref != _ref) ? 16 : 0;

	/* Prologue: first reference row, first dest row, and the constants. */
	vis_ld64(ref[0], TMP0);

	vis_ld64(ref[8], TMP2);

	vis_ld64_2(ref, offset, TMP4);

	vis_ld64(dest[0], DST_0);

	vis_ld64(dest[8], DST_2);

	vis_ld64(constants_fe[0], MASK_fe);
	vis_faligndata(TMP0, TMP2, REF_0);

	vis_ld64(constants_7f[0], MASK_7f);
	vis_faligndata(TMP2, TMP4, REF_2);

	vis_ld64(constants128[0], CONST_128);

	ref += stride;
	/* One pair is peeled off into the epilogue below. */
	height = (height >> 1) - 1;

	do {	/* 24 cycles */
		/* First row of the pair: combine DST/REF, fetch the next row. */
		vis_ld64(ref[0], TMP0);
		vis_xor(DST_0, REF_0, TMP6);

		vis_ld64_2(ref, 8, TMP2);
		vis_and(TMP6, MASK_fe, TMP6);

		vis_ld64_2(ref, offset, TMP4);
		ref += stride;
		vis_mul8x16(CONST_128, TMP6, TMP6);
		vis_xor(DST_2, REF_2, TMP8);

		vis_and(TMP8, MASK_fe, TMP8);

		vis_or(DST_0, REF_0, TMP10);
		vis_ld64_2(dest, stride, DST_0);
		vis_mul8x16(CONST_128, TMP8, TMP8);

		vis_or(DST_2, REF_2, TMP12);
		vis_ld64_2(dest, stride_8, DST_2);

		vis_ld64(ref[0], TMP14);
		vis_and(TMP6, MASK_7f, TMP6);

		vis_and(TMP8, MASK_7f, TMP8);

		vis_psub16(TMP10, TMP6, TMP6);
		vis_st64(TMP6, dest[0]);

		vis_psub16(TMP12, TMP8, TMP8);
		vis_st64_2(TMP8, dest, 8);

		/* Second row of the pair. */
		dest += stride;
		vis_ld64_2(ref, 8, TMP16);
		vis_faligndata(TMP0, TMP2, REF_0);

		vis_ld64_2(ref, offset, TMP18);
		vis_faligndata(TMP2, TMP4, REF_2);
		ref += stride;

		vis_xor(DST_0, REF_0, TMP20);

		vis_and(TMP20, MASK_fe, TMP20);

		vis_xor(DST_2, REF_2, TMP22);
		vis_mul8x16(CONST_128, TMP20, TMP20);

		vis_and(TMP22, MASK_fe, TMP22);

		vis_or(DST_0, REF_0, TMP24);
		vis_mul8x16(CONST_128, TMP22, TMP22);

		vis_or(DST_2, REF_2, TMP26);

		vis_ld64_2(dest, stride, DST_0);
		vis_faligndata(TMP14, TMP16, REF_0);

		vis_ld64_2(dest, stride_8, DST_2);
		vis_faligndata(TMP16, TMP18, REF_2);

		vis_and(TMP20, MASK_7f, TMP20);

		vis_and(TMP22, MASK_7f, TMP22);

		vis_psub16(TMP24, TMP20, TMP20);
		vis_st64(TMP20, dest[0]);

		vis_psub16(TMP26, TMP22, TMP22);
		vis_st64_2(TMP22, dest, 8);
		dest += stride;
	} while (--height);

	/*
	 * Epilogue: last two rows.  Only one more reference row is needed,
	 * so unlike the loop body nothing is prefetched into TMP14..TMP18.
	 */
	vis_ld64(ref[0], TMP0);
	vis_xor(DST_0, REF_0, TMP6);

	vis_ld64_2(ref, 8, TMP2);
	vis_and(TMP6, MASK_fe, TMP6);

	vis_ld64_2(ref, offset, TMP4);
	vis_mul8x16(CONST_128, TMP6, TMP6);
	vis_xor(DST_2, REF_2, TMP8);

	vis_and(TMP8, MASK_fe, TMP8);

	vis_or(DST_0, REF_0, TMP10);
	vis_ld64_2(dest, stride, DST_0);
	vis_mul8x16(CONST_128, TMP8, TMP8);

	vis_or(DST_2, REF_2, TMP12);
	vis_ld64_2(dest, stride_8, DST_2);

	vis_and(TMP6, MASK_7f, TMP6);

	vis_and(TMP8, MASK_7f, TMP8);

	vis_psub16(TMP10, TMP6, TMP6);
	vis_st64(TMP6, dest[0]);

	vis_psub16(TMP12, TMP8, TMP8);
	vis_st64_2(TMP8, dest, 8);

	dest += stride;
	vis_faligndata(TMP0, TMP2, REF_0);

	vis_faligndata(TMP2, TMP4, REF_2);

	vis_xor(DST_0, REF_0, TMP20);

	vis_and(TMP20, MASK_fe, TMP20);

	vis_xor(DST_2, REF_2, TMP22);
	vis_mul8x16(CONST_128, TMP20, TMP20);

	vis_and(TMP22, MASK_fe, TMP22);

	vis_or(DST_0, REF_0, TMP24);
	vis_mul8x16(CONST_128, TMP22, TMP22);

	vis_or(DST_2, REF_2, TMP26);

	vis_and(TMP20, MASK_7f, TMP20);

	vis_and(TMP22, MASK_7f, TMP22);

	vis_psub16(TMP24, TMP20, TMP20);
	vis_st64(TMP20, dest[0]);

	vis_psub16(TMP26, TMP22, TMP22);
	vis_st64_2(TMP22, dest, 8);
}

/*
 * Full-pel "avg" for an 8-pixel-wide block: each dest byte is replaced
 * by the rounded-up average of itself and the realigned reference byte,
 * using the xor / and(0xfe) / mul8x16(0x80) / and(0x7f) / or / psub16
 * sequence described in the formula comment near the top of this file.
 *
 * REF_0 holds the current reference row and DST_0 the matching dest
 * row.  The loop handles two rows per pass; the prologue primes row 0
 * and the epilogue emits the final two rows.
 * NOTE(review): this requires an even height >= 4.  With height == 2 the
 * counter becomes 0 before the bottom-tested loop and `--height` goes
 * negative instead of ending it.
 */
static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * _ref,
			    const int stride, int height)
{
	uint8_t *ref = (uint8_t *) _ref;
	int offset;

	ref = vis_alignaddr(ref);
	/* Second load re-reads offset 0 when the pointer was left unchanged. */
	offset = (ref != _ref) ? 8 : 0;

	/* Prologue: first reference row, first dest row, and the constants. */
	vis_ld64(ref[0], TMP0);

	vis_ld64_2(ref, offset, TMP2);

	vis_ld64(dest[0], DST_0);

	vis_ld64(constants_fe[0], MASK_fe);

	vis_ld64(constants_7f[0], MASK_7f);
	vis_faligndata(TMP0, TMP2, REF_0);

	vis_ld64(constants128[0], CONST_128);

	ref += stride;
	/* One pair is peeled off into the epilogue below. */
	height = (height >> 1) - 1;

	do {	/* 12 cycles */
		/* First row of the pair; the next reference row goes to TMP0/TMP2. */
		vis_ld64(ref[0], TMP0);
		vis_xor(DST_0, REF_0, TMP4);

		vis_ld64_2(ref, offset, TMP2);
		vis_and(TMP4, MASK_fe, TMP4);

		vis_or(DST_0, REF_0, TMP6);
		vis_ld64_2(dest, stride, DST_0);
		ref += stride;
		vis_mul8x16(CONST_128, TMP4, TMP4);

		/* The row after that is staged in TMP12/TMP2. */
		vis_ld64(ref[0], TMP12);
		vis_faligndata(TMP0, TMP2, REF_0);

		vis_ld64_2(ref, offset, TMP2);
		/* TMP0 is reused here as the xor scratch for the second row. */
		vis_xor(DST_0, REF_0, TMP0);
		ref += stride;

		vis_and(TMP0, MASK_fe, TMP0);

		vis_and(TMP4, MASK_7f, TMP4);

		vis_psub16(TMP6, TMP4, TMP4);
		vis_st64(TMP4, dest[0]);
		dest += stride;
		vis_mul8x16(CONST_128, TMP0, TMP0);

		vis_or(DST_0, REF_0, TMP6);
		vis_ld64_2(dest, stride, DST_0);

		vis_faligndata(TMP12, TMP2, REF_0);

		vis_and(TMP0, MASK_7f, TMP0);

		vis_psub16(TMP6, TMP0, TMP4);
		vis_st64(TMP4, dest[0]);
		dest += stride;
	} while (--height);

	/* Epilogue: last two rows, with one final reference row to fetch. */
	vis_ld64(ref[0], TMP0);
	vis_xor(DST_0, REF_0, TMP4);

	vis_ld64_2(ref, offset, TMP2);
	vis_and(TMP4, MASK_fe, TMP4);

	vis_or(DST_0, REF_0, TMP6);
	vis_ld64_2(dest, stride, DST_0);
	vis_mul8x16(CONST_128, TMP4, TMP4);

	vis_faligndata(TMP0, TMP2, REF_0);

	vis_xor(DST_0, REF_0, TMP0);

	vis_and(TMP0, MASK_fe, TMP0);

	vis_and(TMP4, MASK_7f, TMP4);

	vis_psub16(TMP6, TMP4, TMP4);
	vis_st64(TMP4, dest[0]);
	dest += stride;
	vis_mul8x16(CONST_128, TMP0, TMP0);

	vis_or(DST_0, REF_0, TMP6);

	vis_and(TMP0, MASK_7f, TMP0);

	vis_psub16(TMP6, TMP0, TMP4);
	vis_st64(TMP4, dest[0]);
}

/*
 * Horizontal half-pel "put" for a 16-pixel-wide block: each output byte
 * is the rounded-up average of a reference byte and its right-hand
 * neighbour, computed with the xor / and(0xfe) / mul8x16(0x80) /
 * and(0x7f) / or / psub16 sequence described in the formula comment
 * near the top of this file.
 *
 * REF_0/REF_4 hold the 16 bytes starting at byte offset `off` within
 * the aligned words; REF_2/REF_6 hold the 16 bytes starting at off + 1.
 * When off == 7 the shifted window starts exactly on the next 8-byte
 * word, so it is taken with vis_src1() from the following word instead
 * of being realigned.
 *
 * The loop handles two rows per pass; the prologue primes row 0 and the
 * epilogue emits the final two rows.
 * NOTE(review): this requires an even height >= 4.  With height == 2 the
 * counter becomes 0 before the bottom-tested loop and `--height` goes
 * negative instead of ending it.
 */
static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * _ref,
			     const int stride, int height)
{
	uint8_t *ref = (uint8_t *) _ref;
	unsigned long off = (unsigned long) ref & 0x7;
	unsigned long off_plus_1 = off + 1;

	ref = vis_alignaddr(ref);

	/* Prologue: three words of row 0 plus the constants. */
	vis_ld64(ref[0],    TMP0);

	vis_ld64_2(ref, 8,  TMP2);

	vis_ld64_2(ref, 16, TMP4);

	vis_ld64(constants_fe[0], MASK_fe);

	vis_ld64(constants_7f[0], MASK_7f);
	vis_faligndata(TMP0, TMP2, REF_0);

	vis_ld64(constants128[0], CONST_128);
	vis_faligndata(TMP2, TMP4, REF_4);

	/* Build the window shifted right by one pixel. */
	if (off != 0x7) {
		vis_alignaddr_g0((void *)off_plus_1);
		vis_faligndata(TMP0, TMP2, REF_2);
		vis_faligndata(TMP2, TMP4, REF_6);
	} else {
		vis_src1(TMP2, REF_2);
		vis_src1(TMP4, REF_6);
	}

	ref += stride;
	/* One pair is peeled off into the epilogue below. */
	height = (height >> 1) - 1;

	do {	/* 34 cycles */
		/* Average the row already in REF_*, while loading the next two rows. */
		vis_ld64(ref[0],    TMP0);
		vis_xor(REF_0, REF_2, TMP6);

		vis_ld64_2(ref, 8,  TMP2);
		vis_xor(REF_4, REF_6, TMP8);

		vis_ld64_2(ref, 16, TMP4);
		vis_and(TMP6, MASK_fe, TMP6);
		ref += stride;

		vis_ld64(ref[0],    TMP14);
		vis_mul8x16(CONST_128, TMP6, TMP6);
		vis_and(TMP8, MASK_fe, TMP8);

		vis_ld64_2(ref, 8,  TMP16);
		vis_mul8x16(CONST_128, TMP8, TMP8);
		vis_or(REF_0, REF_2, TMP10);

		vis_ld64_2(ref, 16, TMP18);
		ref += stride;
		vis_or(REF_4, REF_6, TMP12);

		/* Restore the unshifted alignment before realigning the new row. */
		vis_alignaddr_g0((void *)off);

		vis_faligndata(TMP0, TMP2, REF_0);

		vis_faligndata(TMP2, TMP4, REF_4);

		if (off != 0x7) {
			vis_alignaddr_g0((void *)off_plus_1);
			vis_faligndata(TMP0, TMP2, REF_2);
			vis_faligndata(TMP2, TMP4, REF_6);
		} else {
			vis_src1(TMP2, REF_2);
			vis_src1(TMP4, REF_6);
		}

		vis_and(TMP6, MASK_7f, TMP6);

		vis_and(TMP8, MASK_7f, TMP8);

		vis_psub16(TMP10, TMP6, TMP6);
		vis_st64(TMP6, dest[0]);

		vis_psub16(TMP12, TMP8, TMP8);
		vis_st64_2(TMP8, dest, 8);
		dest += stride;

		/* Second row of the pair; the row in TMP14..TMP18 is realigned for the next pass. */
		vis_xor(REF_0, REF_2, TMP6);

		vis_xor(REF_4, REF_6, TMP8);

		vis_and(TMP6, MASK_fe, TMP6);

		vis_mul8x16(CONST_128, TMP6, TMP6);
		vis_and(TMP8, MASK_fe, TMP8);

		vis_mul8x16(CONST_128, TMP8, TMP8);
		vis_or(REF_0, REF_2, TMP10);

		vis_or(REF_4, REF_6, TMP12);

		vis_alignaddr_g0((void *)off);

		vis_faligndata(TMP14, TMP16, REF_0);

		vis_faligndata(TMP16, TMP18, REF_4);

		if (off != 0x7) {
			vis_alignaddr_g0((void *)off_plus_1);
			vis_faligndata(TMP14, TMP16, REF_2);
			vis_faligndata(TMP16, TMP18, REF_6);
		} else {
			vis_src1(TMP16, REF_2);
			vis_src1(TMP18, REF_6);
		}

		vis_and(TMP6, MASK_7f, TMP6);

		vis_and(TMP8, MASK_7f, TMP8);

		vis_psub16(TMP10, TMP6, TMP6);
		vis_st64(TMP6, dest[0]);

		vis_psub16(TMP12, TMP8, TMP8);
		vis_st64_2(TMP8, dest, 8);
		dest += stride;
	} while (--height);

	/* Epilogue: last two rows, with one final reference row to fetch. */
	vis_ld64(ref[0],    TMP0);
	vis_xor(REF_0, REF_2, TMP6);

	vis_ld64_2(ref, 8,  TMP2);
	vis_xor(REF_4, REF_6, TMP8);

	vis_ld64_2(ref, 16, TMP4);
	vis_and(TMP6, MASK_fe, TMP6);

	vis_mul8x16(CONST_128, TMP6, TMP6);
	vis_and(TMP8, MASK_fe, TMP8);

	vis_mul8x16(CONST_128, TMP8, TMP8);
	vis_or(REF_0, REF_2, TMP10);

	vis_or(REF_4, REF_6, TMP12);

	vis_alignaddr_g0((void *)off);

	vis_faligndata(TMP0, TMP2, REF_0);

	vis_faligndata(TMP2, TMP4, REF_4);

	if (off != 0x7) {
		vis_alignaddr_g0((void *)off_plus_1);
		vis_faligndata(TMP0, TMP2, REF_2);
		vis_faligndata(TMP2, TMP4, REF_6);
	} else {
		vis_src1(TMP2, REF_2);
		vis_src1(TMP4, REF_6);
	}

	vis_and(TMP6, MASK_7f, TMP6);

	vis_and(TMP8, MASK_7f, TMP8);

	vis_psub16(TMP10, TMP6, TMP6);
	vis_st64(TMP6, dest[0]);

	vis_psub16(TMP12, TMP8, TMP8);
	vis_st64_2(TMP8, dest, 8);
	dest += stride;

	vis_xor(REF_0, REF_2, TMP6);

	vis_xor(REF_4, REF_6, TMP8);

	vis_and(TMP6, MASK_fe, TMP6);

	vis_mul8x16(CONST_128, TMP6, TMP6);
	vis_and(TMP8, MASK_fe, TMP8);

	vis_mul8x16(CONST_128, TMP8, TMP8);
	vis_or(REF_0, REF_2, TMP10);

	vis_or(REF_4, REF_6, TMP12);

	vis_and(TMP6, MASK_7f, TMP6);

	vis_and(TMP8, MASK_7f, TMP8);

	vis_psub16(TMP10, TMP6, TMP6);
	vis_st64(TMP6, dest[0]);

	vis_psub16(TMP12, TMP8, TMP8);
	vis_st64_2(TMP8, dest, 8);
}

/*
 * Horizontal half-pel "put" for an 8-pixel-wide block: each output byte
 * is the rounded-up average of a reference byte and its right-hand
 * neighbour, computed with the xor / and(0xfe) / mul8x16(0x80) /
 * and(0x7f) / or / psub16 sequence described in the formula comment
 * near the top of this file.
 *
 * REF_0 holds the 8 bytes starting at byte offset `off` within the
 * aligned words; REF_2 holds the 8 bytes starting at off + 1.  When
 * off == 7 the shifted window is exactly the following 8-byte word, so
 * it is taken with vis_src1() instead of being realigned.
 *
 * The loop handles two rows per pass; the prologue primes row 0 and the
 * epilogue emits the final two rows.
 * NOTE(review): this requires an even height >= 4.  With height == 2 the
 * counter becomes 0 before the bottom-tested loop and `--height` goes
 * negative instead of ending it.
 */
static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * _ref,
			    const int stride, int height)
{
	uint8_t *ref = (uint8_t *) _ref;
	unsigned long off = (unsigned long) ref & 0x7;
	unsigned long off_plus_1 = off + 1;

	ref = vis_alignaddr(ref);

	/* Prologue: two words of row 0 plus the constants. */
	vis_ld64(ref[0], TMP0);

	vis_ld64(ref[8], TMP2);

	vis_ld64(constants_fe[0], MASK_fe);

	vis_ld64(constants_7f[0], MASK_7f);

	vis_ld64(constants128[0], CONST_128);
	vis_faligndata(TMP0, TMP2, REF_0);

	/* Build the window shifted right by one pixel. */
	if (off != 0x7) {
		vis_alignaddr_g0((void *)off_plus_1);
		vis_faligndata(TMP0, TMP2, REF_2);
	} else {
		vis_src1(TMP2, REF_2);
	}

	ref += stride;
	/* One pair is peeled off into the epilogue below. */
	height = (height >> 1) - 1;

	do {	/* 20 cycles */
		/* Average the row already in REF_*, while loading the next two rows. */
		vis_ld64(ref[0], TMP0);
		vis_xor(REF_0, REF_2, TMP4);

		vis_ld64_2(ref, 8, TMP2);
		vis_and(TMP4, MASK_fe, TMP4);
		ref += stride;

		vis_ld64(ref[0], TMP8);
		vis_or(REF_0, REF_2, TMP6);
		vis_mul8x16(CONST_128, TMP4, TMP4);

		/* Restore the unshifted alignment before realigning the new row. */
		vis_alignaddr_g0((void *)off);

		vis_ld64_2(ref, 8, TMP10);
		ref += stride;
		vis_faligndata(TMP0, TMP2, REF_0);

		if (off != 0x7) {
			vis_alignaddr_g0((void *)off_plus_1);
			vis_faligndata(TMP0, TMP2, REF_2);
		} else {
			vis_src1(TMP2, REF_2);
		}

		vis_and(TMP4, MASK_7f, TMP4);

		vis_psub16(TMP6, TMP4, DST_0);
		vis_st64(DST_0, dest[0]);
		dest += stride;

		/* Second row of the pair; the row in TMP8/TMP10 is realigned for the next pass. */
		vis_xor(REF_0, REF_2, TMP12);

		vis_and(TMP12, MASK_fe, TMP12);

		vis_or(REF_0, REF_2, TMP14);
		vis_mul8x16(CONST_128, TMP12, TMP12);

		vis_alignaddr_g0((void *)off);
		vis_faligndata(TMP8, TMP10, REF_0);
		if (off != 0x7) {
			vis_alignaddr_g0((void *)off_plus_1);
			vis_faligndata(TMP8, TMP10, REF_2);
		} else {
			vis_src1(TMP10, REF_2);
		}

		vis_and(TMP12, MASK_7f, TMP12);

		vis_psub16(TMP14, TMP12, DST_0);
		vis_st64(DST_0, dest[0]);
		dest += stride;
	} while (--height);

	/* Epilogue: last two rows, with one final reference row to fetch. */
	vis_ld64(ref[0], TMP0);
	vis_xor(REF_0, REF_2, TMP4);

	vis_ld64_2(ref, 8, TMP2);
	vis_and(TMP4, MASK_fe, TMP4);

	vis_or(REF_0, REF_2, TMP6);
	vis_mul8x16(CONST_128, TMP4, TMP4);

	vis_alignaddr_g0((void *)off);

	vis_faligndata(TMP0, TMP2, REF_0);

	if (off != 0x7) {
		vis_alignaddr_g0((void *)off_plus_1);
		vis_faligndata(TMP0, TMP2, REF_2);
	} else {
		vis_src1(TMP2, REF_2);
	}

	vis_and(TMP4, MASK_7f, TMP4);

	vis_psub16(TMP6, TMP4, DST_0);
	vis_st64(DST_0, dest[0]);
	dest += stride;

	vis_xor(REF_0, REF_2, TMP12);

	vis_and(TMP12, MASK_fe, TMP12);

	vis_or(REF_0, REF_2, TMP14);
	vis_mul8x16(CONST_128, TMP12, TMP12);

	vis_and(TMP12, MASK_7f, TMP12);

	vis_psub16(TMP14, TMP12, DST_0);
	/* Final row: dest is not advanced again, nothing reads it afterwards. */
	vis_st64(DST_0, dest[0]);
}

/*
 * Horizontal half-pel "avg" for a 16-pixel-wide block: combines each
 * reference byte, its right-hand neighbour and the existing dest byte,
 * and writes the result back to dest, one row per loop pass.
 *
 * Unlike the put_x variants this uses 16-bit lane arithmetic: bytes are
 * widened with vis_mul8x16au / vis_pmerge(ZERO, ...) / vis_mul8x16al,
 * summed with vis_padd16 together with CONST_3, and narrowed again by
 * vis_pack16 under the GSR scale factor set at the top.
 * NOTE(review): this presumably evaluates (a + b + 2*dst + 3) >> 2 per
 * pixel -- confirm against the VIS instruction definitions.
 *
 * REF_0/REF_4 hold the row at byte offset `off`, REF_2/REF_6 the row at
 * off + 1 (taken with vis_src1() when off == 7).
 *
 * The loop is bottom-tested, so callers must pass height >= 1.
 */
static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * _ref,
			     const int stride, int height)
{
	uint8_t *ref = (uint8_t *) _ref;
	unsigned long off = (unsigned long) ref & 0x7;
	unsigned long off_plus_1 = off + 1;

	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

	vis_ld64(constants3[0], CONST_3);
	vis_fzero(ZERO);
	vis_ld64(constants256_512[0], CONST_256);

	ref = vis_alignaddr(ref);
	do {	/* 26 cycles */
		vis_ld64(ref[0], TMP0);

		vis_ld64(ref[8], TMP2);

		/* Restore the unshifted alignment changed by the previous pass. */
		vis_alignaddr_g0((void *)off);

		vis_ld64(ref[16], TMP4);

		vis_ld64(dest[0], DST_0);
		vis_faligndata(TMP0, TMP2, REF_0);

		vis_ld64(dest[8], DST_2);
		vis_faligndata(TMP2, TMP4, REF_4);

		/* Build the window shifted right by one pixel. */
		if (off != 0x7) {
			vis_alignaddr_g0((void *)off_plus_1);
			vis_faligndata(TMP0, TMP2, REF_2);
			vis_faligndata(TMP2, TMP4, REF_6);
		} else {
			vis_src1(TMP2, REF_2);
			vis_src1(TMP4, REF_6);
		}

		/* Left 8 pixels: TMP0/TMP2 accumulate the two halves. */
		vis_mul8x16au(REF_0,   CONST_256, TMP0);

		vis_pmerge(ZERO,     REF_2,     TMP4);
		vis_mul8x16au(REF_0_1, CONST_256, TMP2);

		vis_pmerge(ZERO, REF_2_1, TMP6);

		vis_padd16(TMP0, TMP4, TMP0);

		vis_mul8x16al(DST_0,   CONST_512, TMP4);
		vis_padd16(TMP2, TMP6, TMP2);

		vis_mul8x16al(DST_1,   CONST_512, TMP6);

		/* Right 8 pixels are started while the left ones finish. */
		vis_mul8x16au(REF_6,   CONST_256, TMP12);

		vis_padd16(TMP0, TMP4, TMP0);
		vis_mul8x16au(REF_6_1, CONST_256, TMP14);

		vis_padd16(TMP2, TMP6, TMP2);
		vis_mul8x16au(REF_4,   CONST_256, TMP16);

		vis_padd16(TMP0, CONST_3, TMP8);
		vis_mul8x16au(REF_4_1, CONST_256, TMP18);

		vis_padd16(TMP2, CONST_3, TMP10);
		vis_pack16(TMP8, DST_0);

		vis_pack16(TMP10, DST_1);
		vis_padd16(TMP16, TMP12, TMP0);

		vis_st64(DST_0, dest[0]);
		vis_mul8x16al(DST_2,   CONST_512, TMP4);
		vis_padd16(TMP18, TMP14, TMP2);

		vis_mul8x16al(DST_3,   CONST_512, TMP6);
		vis_padd16(TMP0, CONST_3, TMP0);

		vis_padd16(TMP2, CONST_3, TMP2);

		vis_padd16(TMP0, TMP4, TMP0);

		vis_padd16(TMP2, TMP6, TMP2);
		vis_pack16(TMP0, DST_2);

		vis_pack16(TMP2, DST_3);
		vis_st64(DST_2, dest[8]);

		ref += stride;
		dest += stride;
	} while (--height);
}

/*
 * Horizontal half-pel "avg" for an 8-pixel-wide block: combines each
 * reference byte, its right-hand neighbour and the existing dest byte,
 * and writes the result back to dest.
 *
 * Uses 16-bit lane arithmetic: bytes are widened with vis_pmerge(ZERO,
 * ...) / vis_mul8x16au / vis_mul8x16al, summed with vis_padd16 together
 * with CONST_3, and narrowed by vis_pack16 under the GSR scale factor
 * set at the top.
 * NOTE(review): this presumably evaluates (a + b + 2*dst + 3) >> 2 per
 * pixel -- confirm against the VIS instruction definitions.
 *
 * Four rows are processed per loop pass: REF_0/REF_4/REF_S0/REF_S4 hold
 * the rows at byte offset `off`, REF_2/REF_6/REF_S2/REF_S6 the same rows
 * at off + 1 (taken with vis_src1() when off == 7).
 * NOTE(review): height is divided by 4 and the loop is bottom-tested,
 * so height must be a positive multiple of 4.
 */
static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * _ref,
			    const int stride, int height)
{
	uint8_t *ref = (uint8_t *) _ref;
	unsigned long off = (unsigned long) ref & 0x7;
	unsigned long off_plus_1 = off + 1;
	int stride_times_2 = stride << 1;

	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

	vis_ld64(constants3[0], CONST_3);
	vis_fzero(ZERO);
	vis_ld64(constants256_512[0], CONST_256);

	ref = vis_alignaddr(ref);
	height >>= 2;
	do {	/* 47 cycles */
		/* Load four reference rows (two words each) and realign them. */
		vis_ld64(ref[0],   TMP0);

		vis_ld64_2(ref, 8, TMP2);
		ref += stride;

		/* Restore the unshifted alignment changed by the previous pass. */
		vis_alignaddr_g0((void *)off);

		vis_ld64(ref[0],   TMP4);
		vis_faligndata(TMP0, TMP2, REF_0);

		vis_ld64_2(ref, 8, TMP6);
		ref += stride;

		vis_ld64(ref[0],   TMP8);

		vis_ld64_2(ref, 8, TMP10);
		ref += stride;
		vis_faligndata(TMP4, TMP6, REF_4);

		vis_ld64(ref[0],   TMP12);

		vis_ld64_2(ref, 8, TMP14);
		ref += stride;
		vis_faligndata(TMP8, TMP10, REF_S0);

		vis_faligndata(TMP12, TMP14, REF_S4);

		/* Shifted-by-one windows for all four rows, plus the first two dest rows. */
		if (off != 0x7) {
			vis_alignaddr_g0((void *)off_plus_1);

			vis_ld64(dest[0], DST_0);
			vis_faligndata(TMP0, TMP2, REF_2);

			vis_ld64_2(dest, stride, DST_2);
			vis_faligndata(TMP4, TMP6, REF_6);

			vis_faligndata(TMP8, TMP10, REF_S2);

			vis_faligndata(TMP12, TMP14, REF_S6);
		} else {
			vis_ld64(dest[0], DST_0);
			vis_src1(TMP2, REF_2);

			vis_ld64_2(dest, stride, DST_2);
			vis_src1(TMP6, REF_6);

			vis_src1(TMP10, REF_S2);

			vis_src1(TMP14, REF_S6);
		}

		/* Rows 0 and 1 of this group. */
		vis_pmerge(ZERO,     REF_0,     TMP0);
		vis_mul8x16au(REF_0_1, CONST_256, TMP2);

		vis_pmerge(ZERO,     REF_2,     TMP4);
		vis_mul8x16au(REF_2_1, CONST_256, TMP6);

		vis_padd16(TMP0, CONST_3, TMP0);
		vis_mul8x16al(DST_0,   CONST_512, TMP16);

		vis_padd16(TMP2, CONST_3, TMP2);
		vis_mul8x16al(DST_1,   CONST_512, TMP18);

		vis_padd16(TMP0, TMP4, TMP0);
		vis_mul8x16au(REF_4, CONST_256, TMP8);

		vis_padd16(TMP2, TMP6, TMP2);
		vis_mul8x16au(REF_4_1, CONST_256, TMP10);

		vis_padd16(TMP0, TMP16, TMP0);
		vis_mul8x16au(REF_6, CONST_256, TMP12);

		vis_padd16(TMP2, TMP18, TMP2);
		vis_mul8x16au(REF_6_1, CONST_256, TMP14);

		vis_padd16(TMP8, CONST_3, TMP8);
		vis_mul8x16al(DST_2, CONST_512, TMP16);

		vis_padd16(TMP8, TMP12, TMP8);
		vis_mul8x16al(DST_3, CONST_512, TMP18);

		vis_padd16(TMP10, TMP14, TMP10);
		vis_pack16(TMP0, DST_0);

		vis_pack16(TMP2, DST_1);
		vis_st64(DST_0, dest[0]);
		dest += stride;
		vis_padd16(TMP10, CONST_3, TMP10);

		/* dest now points at row 1: fetch dest rows 2 and 3 of the group. */
		vis_ld64_2(dest, stride, DST_0);
		vis_padd16(TMP8, TMP16, TMP8);

		/* Row 3 goes to TMP4/TMP5 because DST_2 is still being written below. */
		vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);
		vis_padd16(TMP10, TMP18, TMP10);
		vis_pack16(TMP8, DST_2);

		vis_pack16(TMP10, DST_3);
		vis_st64(DST_2, dest[0]);
		dest += stride;

		/* Rows 2 and 3 of this group. */
		vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
		vis_pmerge(ZERO,     REF_S0,     TMP0);

		vis_pmerge(ZERO,     REF_S2,     TMP24);
		vis_mul8x16au(REF_S2_1, CONST_256, TMP6);

		vis_padd16(TMP0, CONST_3, TMP0);
		vis_mul8x16au(REF_S4, CONST_256, TMP8);

		vis_padd16(TMP2, CONST_3, TMP2);
		vis_mul8x16au(REF_S4_1, CONST_256, TMP10);

		vis_padd16(TMP0, TMP24, TMP0);
		vis_mul8x16au(REF_S6, CONST_256, TMP12);

		vis_padd16(TMP2, TMP6, TMP2);
		vis_mul8x16au(REF_S6_1, CONST_256, TMP14);

		vis_padd16(TMP8, CONST_3, TMP8);
		vis_mul8x16al(DST_0,   CONST_512, TMP16);

		vis_padd16(TMP10, CONST_3, TMP10);
		vis_mul8x16al(DST_1,   CONST_512, TMP18);

		vis_padd16(TMP8, TMP12, TMP8);
		vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);

		vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
		vis_padd16(TMP0, TMP16, TMP0);

		vis_padd16(TMP2, TMP18, TMP2);
		vis_pack16(TMP0, DST_0);

		vis_padd16(TMP10, TMP14, TMP10);
		vis_pack16(TMP2, DST_1);
		vis_st64(DST_0, dest[0]);
		dest += stride;

		vis_padd16(TMP8, TMP20, TMP8);

		vis_padd16(TMP10, TMP22, TMP10);
		vis_pack16(TMP8, DST_2);

		vis_pack16(TMP10, DST_3);
		vis_st64(DST_2, dest[0]);
		dest += stride;
	} while (--height);
}

/*
 * Vertical half-pel "put" for a 16-pixel-wide block: each output row is
 * the rounded-up byte-wise average of a reference row and the row below
 * it, computed with the xor / and(0xfe) / mul8x16(0x80) / and(0x7f) /
 * or / psub16 sequence described in the formula comment near the top of
 * this file.
 *
 * REF_0/REF_4 and REF_2/REF_6 alternately hold two consecutive source
 * rows: after each output row the older of the two is replaced by the
 * next row down.
 *
 * The loop handles two rows per pass; the prologue primes source rows 0
 * and 1 and the epilogue emits the final two output rows.
 * NOTE(review): this requires an even height >= 4.  With height == 2 the
 * counter becomes 0 before the bottom-tested loop and `--height` goes
 * negative instead of ending it.
 */
static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * _ref,
			     const int stride, int height)
{
	uint8_t *ref = (uint8_t *) _ref;
	int offset;

	ref = vis_alignaddr(ref);
	/* Third load re-reads offset 0 when the pointer was left unchanged. */
	offset = (ref != _ref) ? 16 : 0;

	/* Prologue: source rows 0 and 1 plus the constants. */
	vis_ld64(ref[0], TMP0);

	vis_ld64_2(ref, 8, TMP2);

	vis_ld64_2(ref, offset, TMP4);
	ref += stride;

	vis_ld64(ref[0], TMP6);
	vis_faligndata(TMP0, TMP2, REF_0);

	vis_ld64_2(ref, 8, TMP8);
	vis_faligndata(TMP2, TMP4, REF_4);

	vis_ld64_2(ref, offset, TMP10);
	ref += stride;

	vis_ld64(constants_fe[0], MASK_fe);
	vis_faligndata(TMP6, TMP8, REF_2);

	vis_ld64(constants_7f[0], MASK_7f);
	vis_faligndata(TMP8, TMP10, REF_6);

	vis_ld64(constants128[0], CONST_128);
	/* One pair is peeled off into the epilogue below. */
	height = (height >> 1) - 1;
	do {	/* 24 cycles */
		/* First output row of the pair; two more source rows are loaded meanwhile. */
		vis_ld64(ref[0], TMP0);
		vis_xor(REF_0, REF_2, TMP12);

		vis_ld64_2(ref, 8, TMP2);
		vis_xor(REF_4, REF_6, TMP16);

		vis_ld64_2(ref, offset, TMP4);
		ref += stride;
		vis_or(REF_0, REF_2, TMP14);

		vis_ld64(ref[0], TMP6);
		vis_or(REF_4, REF_6, TMP18);

		vis_ld64_2(ref, 8, TMP8);
		vis_faligndata(TMP0, TMP2, REF_0);

		vis_ld64_2(ref, offset, TMP10);
		ref += stride;
		vis_faligndata(TMP2, TMP4, REF_4);

		vis_and(TMP12, MASK_fe, TMP12);

		vis_and(TMP16, MASK_fe, TMP16);
		vis_mul8x16(CONST_128, TMP12, TMP12);

		vis_mul8x16(CONST_128, TMP16, TMP16);
		/* Second output row: REF_0/REF_4 now hold the newer row. */
		vis_xor(REF_0, REF_2, TMP0);

		vis_xor(REF_4, REF_6, TMP2);

		vis_or(REF_0, REF_2, TMP20);

		vis_and(TMP12, MASK_7f, TMP12);

		vis_and(TMP16, MASK_7f, TMP16);

		vis_psub16(TMP14, TMP12, TMP12);
		vis_st64(TMP12, dest[0]);

		vis_psub16(TMP18, TMP16, TMP16);
		vis_st64_2(TMP16, dest, 8);
		dest += stride;

		vis_or(REF_4, REF_6, TMP18);

		vis_and(TMP0, MASK_fe, TMP0);

		vis_and(TMP2, MASK_fe, TMP2);
		vis_mul8x16(CONST_128, TMP0, TMP0);

		/* Replace the older row in REF_2/REF_6 for the next pass. */
		vis_faligndata(TMP6, TMP8, REF_2);
		vis_mul8x16(CONST_128, TMP2, TMP2);

		vis_faligndata(TMP8, TMP10, REF_6);

		vis_and(TMP0, MASK_7f, TMP0);

		vis_and(TMP2, MASK_7f, TMP2);

		vis_psub16(TMP20, TMP0, TMP0);
		vis_st64(TMP0, dest[0]);

		vis_psub16(TMP18, TMP2, TMP2);
		vis_st64_2(TMP2, dest, 8);
		dest += stride;
	} while (--height);

	/* Epilogue: last two output rows, with one final source row to fetch. */
	vis_ld64(ref[0], TMP0);
	vis_xor(REF_0, REF_2, TMP12);

	vis_ld64_2(ref, 8, TMP2);
	vis_xor(REF_4, REF_6, TMP16);

	vis_ld64_2(ref, offset, TMP4);
	vis_or(REF_0, REF_2, TMP14);

	vis_or(REF_4, REF_6, TMP18);

	vis_faligndata(TMP0, TMP2, REF_0);

	vis_faligndata(TMP2, TMP4, REF_4);

	vis_and(TMP12, MASK_fe, TMP12);

	vis_and(TMP16, MASK_fe, TMP16);
	vis_mul8x16(CONST_128, TMP12, TMP12);

	vis_mul8x16(CONST_128, TMP16, TMP16);
	vis_xor(REF_0, REF_2, TMP0);

	vis_xor(REF_4, REF_6, TMP2);

	vis_or(REF_0, REF_2, TMP20);

	vis_and(TMP12, MASK_7f, TMP12);

	vis_and(TMP16, MASK_7f, TMP16);

	vis_psub16(TMP14, TMP12, TMP12);
	vis_st64(TMP12, dest[0]);

	vis_psub16(TMP18, TMP16, TMP16);
	vis_st64_2(TMP16, dest, 8);
	dest += stride;

	vis_or(REF_4, REF_6, TMP18);

	vis_and(TMP0, MASK_fe, TMP0);

	vis_and(TMP2, MASK_fe, TMP2);
	vis_mul8x16(CONST_128, TMP0, TMP0);

	vis_mul8x16(CONST_128, TMP2, TMP2);

	vis_and(TMP0, MASK_7f, TMP0);

	vis_and(TMP2, MASK_7f, TMP2);

	vis_psub16(TMP20, TMP0, TMP0);
	vis_st64(TMP0, dest[0]);

	vis_psub16(TMP18, TMP2, TMP2);
	vis_st64_2(TMP2, dest, 8);
}

/*
 * Vertical half-pel "put" for an 8-pixel-wide block: each output row is
 * the rounded-up byte-wise average of a reference row and the row below
 * it, computed with the xor / and(0xfe) / mul8x16(0x80) / and(0x7f) /
 * or / psub16 sequence described in the formula comment near the top of
 * this file.
 *
 * REF_0 and REF_2 alternately hold two consecutive source rows: after
 * each output row the older of the two is replaced by the next row down.
 *
 * The loop handles two rows per pass; the prologue primes source rows 0
 * and 1 and the epilogue emits the final two output rows.
 * NOTE(review): this requires an even height >= 4.  With height == 2 the
 * counter becomes 0 before the bottom-tested loop and `--height` goes
 * negative instead of ending it.
 */
static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * _ref,
			    const int stride, int height)
{
	uint8_t *ref = (uint8_t *) _ref;
	int offset;

	ref = vis_alignaddr(ref);
	/* Second load re-reads offset 0 when the pointer was left unchanged. */
	offset = (ref != _ref) ? 8 : 0;

	/* Prologue: source rows 0 and 1 plus the constants. */
	vis_ld64(ref[0], TMP0);

	vis_ld64_2(ref, offset, TMP2);
	ref += stride;

	vis_ld64(ref[0], TMP4);

	vis_ld64_2(ref, offset, TMP6);
	ref += stride;

	vis_ld64(constants_fe[0], MASK_fe);
	vis_faligndata(TMP0, TMP2, REF_0);

	vis_ld64(constants_7f[0], MASK_7f);
	vis_faligndata(TMP4, TMP6, REF_2);

	vis_ld64(constants128[0], CONST_128);
	/* One pair is peeled off into the epilogue below. */
	height = (height >> 1) - 1;
	do {	/* 12 cycles */
		/* First output row of the pair. */
		vis_ld64(ref[0], TMP0);
		vis_xor(REF_0, REF_2, TMP4);

		vis_ld64_2(ref, offset, TMP2);
		ref += stride;
		vis_and(TMP4, MASK_fe, TMP4);

		vis_or(REF_0, REF_2, TMP6);
		vis_mul8x16(CONST_128, TMP4, TMP4);

		/* REF_0 takes the next row; TMP0/TMP2 are then reloaded with the one after. */
		vis_faligndata(TMP0, TMP2, REF_0);
		vis_ld64(ref[0], TMP0);

		vis_ld64_2(ref, offset, TMP2);
		ref += stride;
		/* Second output row of the pair. */
		vis_xor(REF_0, REF_2, TMP12);

		vis_and(TMP4, MASK_7f, TMP4);

		vis_and(TMP12, MASK_fe, TMP12);

		vis_mul8x16(CONST_128, TMP12, TMP12);
		vis_or(REF_0, REF_2, TMP14);

		vis_psub16(TMP6, TMP4, DST_0);
		vis_st64(DST_0, dest[0]);
		dest += stride;

		/* Replace the older row in REF_2 for the next pass. */
		vis_faligndata(TMP0, TMP2, REF_2);

		vis_and(TMP12, MASK_7f, TMP12);

		vis_psub16(TMP14, TMP12, DST_0);
		vis_st64(DST_0, dest[0]);
		dest += stride;
	} while (--height);

	/* Epilogue: last two output rows, with one final source row to fetch. */
	vis_ld64(ref[0], TMP0);
	vis_xor(REF_0, REF_2, TMP4);

	vis_ld64_2(ref, offset, TMP2);
	vis_and(TMP4, MASK_fe, TMP4);

	vis_or(REF_0, REF_2, TMP6);
	vis_mul8x16(CONST_128, TMP4, TMP4);

	vis_faligndata(TMP0, TMP2, REF_0);

	vis_xor(REF_0, REF_2, TMP12);

	vis_and(TMP4, MASK_7f, TMP4);

	vis_and(TMP12, MASK_fe, TMP12);

	vis_mul8x16(CONST_128, TMP12, TMP12);
	vis_or(REF_0, REF_2, TMP14);

	vis_psub16(TMP6, TMP4, DST_0);
	vis_st64(DST_0, dest[0]);
	dest += stride;

	vis_and(TMP12, MASK_7f, TMP12);

	vis_psub16(TMP14, TMP12, DST_0);
	vis_st64(DST_0, dest[0]);
}

/*
 * Vertical half-pel "avg" for a 16-pixel-wide block: combines each
 * reference byte, the byte directly below it and the existing dest
 * byte, and writes the result back to dest.
 *
 * Uses 16-bit lane arithmetic: bytes are widened with vis_pmerge(ZERO,
 * ...) / vis_mul8x16au / vis_mul8x16al, summed with vis_padd16 together
 * with CONST_3, and narrowed by vis_pack16 under the GSR scale factor
 * set at the top.
 * NOTE(review): this presumably evaluates (a + b + 2*dst + 3) >> 2 per
 * pixel -- confirm against the VIS instruction definitions.
 *
 * Two output rows per loop pass.  REF_2/REF_6 carry the most recent
 * source row from one pass into the next; REF_0/REF_4 hold the row
 * between.  REF_S0/REF_S2 are borrowed for the second dest row and
 * REF_S4/REF_S6 as scratch.
 * NOTE(review): height is halved and the loop is bottom-tested, so
 * height must be even and at least 2.
 */
static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * _ref,
			     const int stride, int height)
{
	uint8_t *ref = (uint8_t *) _ref;
	int stride_8 = stride + 8;
	int stride_16;
	int offset;

	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

	ref = vis_alignaddr(ref);
	/* Third load re-reads offset 0 when the pointer was left unchanged. */
	offset = (ref != _ref) ? 16 : 0;

	/* Prologue: source row 0 into REF_2/REF_6, plus the constants. */
	vis_ld64(ref[ 0], TMP0);
	vis_fzero(ZERO);

	vis_ld64(ref[ 8], TMP2);

	vis_ld64_2(ref, offset, TMP4);
	/* Offset of the third word of the row one stride further down. */
	stride_16 = stride + offset;

	vis_ld64(constants3[0], CONST_3);
	vis_faligndata(TMP0, TMP2, REF_2);

	vis_ld64(constants256_512[0], CONST_256);
	vis_faligndata(TMP2, TMP4, REF_6);
	height >>= 1;

	do {	/* 31 cycles */
		/* Widen the carried-over row while the next source row loads. */
		vis_ld64_2(ref, stride, TMP0);
		vis_pmerge(ZERO,       REF_2,     TMP12);
		vis_mul8x16au(REF_2_1, CONST_256, TMP14);

		vis_ld64_2(ref, stride_8, TMP2);
		vis_pmerge(ZERO,       REF_6,     TMP16);
		vis_mul8x16au(REF_6_1, CONST_256, TMP18);

		vis_ld64_2(ref, stride_16, TMP4);
		ref += stride;

		vis_ld64(dest[0], DST_0);
		vis_faligndata(TMP0, TMP2, REF_0);

		vis_ld64_2(dest, 8, DST_2);
		vis_faligndata(TMP2, TMP4, REF_4);

		/* Middle row widened into TMP0..TMP6; third source row loads into TMP6..TMP10. */
		vis_ld64_2(ref, stride, TMP6);
		vis_pmerge(ZERO,     REF_0,     TMP0);
		vis_mul8x16au(REF_0_1, CONST_256, TMP2);

		vis_ld64_2(ref, stride_8, TMP8);
		vis_pmerge(ZERO,     REF_4,     TMP4);

		vis_ld64_2(ref, stride_16, TMP10);
		ref += stride;

		/* Second dest row is parked in REF_S0/REF_S2. */
		vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
		vis_faligndata(TMP6, TMP8, REF_2);
		vis_mul8x16au(REF_4_1, CONST_256, TMP6);

		vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
		vis_faligndata(TMP8, TMP10, REF_6);
		vis_mul8x16al(DST_0,   CONST_512, TMP20);

		/* Rounding constant is folded into the middle row, shared by both outputs. */
		vis_padd16(TMP0, CONST_3, TMP0);
		vis_mul8x16al(DST_1,   CONST_512, TMP22);

		vis_padd16(TMP2, CONST_3, TMP2);
		vis_mul8x16al(DST_2,   CONST_512, TMP24);

		vis_padd16(TMP4, CONST_3, TMP4);
		vis_mul8x16al(DST_3,   CONST_512, TMP26);

		vis_padd16(TMP6, CONST_3, TMP6);

		vis_padd16(TMP12, TMP20, TMP12);
		vis_mul8x16al(REF_S0,   CONST_512, TMP20);

		vis_padd16(TMP14, TMP22, TMP14);
		vis_mul8x16al(REF_S0_1, CONST_512, TMP22);

		vis_padd16(TMP16, TMP24, TMP16);
		vis_mul8x16al(REF_S2,   CONST_512, TMP24);

		vis_padd16(TMP18, TMP26, TMP18);
		vis_mul8x16al(REF_S2_1, CONST_512, TMP26);

		vis_padd16(TMP12, TMP0, TMP12);
		vis_mul8x16au(REF_2,   CONST_256, TMP28);

		vis_padd16(TMP14, TMP2, TMP14);
		vis_mul8x16au(REF_2_1, CONST_256, TMP30);

		vis_padd16(TMP16, TMP4, TMP16);
		vis_mul8x16au(REF_6,   CONST_256, REF_S4);

		vis_padd16(TMP18, TMP6, TMP18);
		vis_mul8x16au(REF_6_1, CONST_256, REF_S6);

		/* First output row is packed and stored; second row sums are started. */
		vis_pack16(TMP12, DST_0);
		vis_padd16(TMP28, TMP0, TMP12);

		vis_pack16(TMP14, DST_1);
		vis_st64(DST_0, dest[0]);
		vis_padd16(TMP30, TMP2, TMP14);

		vis_pack16(TMP16, DST_2);
		vis_padd16(REF_S4, TMP4, TMP16);

		vis_pack16(TMP18, DST_3);
		vis_st64_2(DST_2, dest, 8);
		dest += stride;
		vis_padd16(REF_S6, TMP6, TMP18);

		vis_padd16(TMP12, TMP20, TMP12);

		vis_padd16(TMP14, TMP22, TMP14);
		vis_pack16(TMP12, DST_0);

		vis_padd16(TMP16, TMP24, TMP16);
		vis_pack16(TMP14, DST_1);
		vis_st64(DST_0, dest[0]);

		vis_padd16(TMP18, TMP26, TMP18);
		vis_pack16(TMP16, DST_2);

		vis_pack16(TMP18, DST_3);
		vis_st64_2(DST_2, dest, 8);
		dest += stride;
	} while (--height);
}

/*
 * Vertical half-pel "avg" for an 8-pixel-wide block: combines each
 * reference byte, the byte directly below it and the existing dest
 * byte, and writes the result back to dest.
 *
 * Uses 16-bit lane arithmetic: bytes are widened with vis_pmerge(ZERO,
 * ...) / vis_mul8x16au / vis_mul8x16al, summed with vis_padd16 together
 * with CONST_3, and narrowed by vis_pack16 under the GSR scale factor
 * set at the top.
 * NOTE(review): this presumably evaluates (a + b + 2*dst + 3) >> 2 per
 * pixel -- confirm against the VIS instruction definitions.
 *
 * Two output rows per loop pass.  REF_2 carries the most recent source
 * row from one pass into the next; REF_0 holds the row between.
 * NOTE(review): height is halved and the loop is bottom-tested, so
 * height must be even and at least 2.
 */
static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * _ref,
			    const int stride, int height)
{
	uint8_t *ref = (uint8_t *) _ref;
	int stride_8;
	int offset;

	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

	ref = vis_alignaddr(ref);
	/* Second load re-reads offset 0 when the pointer was left unchanged. */
	offset = (ref != _ref) ? 8 : 0;

	/* Prologue: source row 0 into REF_2, plus the constants. */
	vis_ld64(ref[ 0], TMP0);
	vis_fzero(ZERO);

	vis_ld64_2(ref, offset, TMP2);
	/* Offset of the second word of the row one stride further down. */
	stride_8 = stride + offset;

	vis_ld64(constants3[0], CONST_3);
	vis_faligndata(TMP0, TMP2, REF_2);

	vis_ld64(constants256_512[0], CONST_256);

	height >>= 1;
	do {	/* 20 cycles */
		/* Widen the carried-over row while the next source row loads. */
		vis_ld64_2(ref, stride, TMP0);
		vis_pmerge(ZERO,       REF_2,     TMP8);
		vis_mul8x16au(REF_2_1, CONST_256, TMP10);

		vis_ld64_2(ref, stride_8, TMP2);
		ref += stride;

		/* Both dest rows of this pass. */
		vis_ld64(dest[0], DST_0);

		vis_ld64_2(dest, stride, DST_2);
		vis_faligndata(TMP0, TMP2, REF_0);

		/* Middle row widened into TMP12/TMP14; third source row loads into TMP4/TMP6. */
		vis_ld64_2(ref, stride, TMP4);
		vis_mul8x16al(DST_0,   CONST_512, TMP16);
		vis_pmerge(ZERO,       REF_0,     TMP12);

		vis_ld64_2(ref, stride_8, TMP6);
		ref += stride;
		vis_mul8x16al(DST_1,   CONST_512, TMP18);
		vis_pmerge(ZERO,       REF_0_1,   TMP14);

		/* Rounding constant is folded into the middle row, shared by both outputs. */
		vis_padd16(TMP12, CONST_3, TMP12);
		vis_mul8x16al(DST_2,   CONST_512, TMP24);

		vis_padd16(TMP14, CONST_3, TMP14);
		vis_mul8x16al(DST_3,   CONST_512, TMP26);

		vis_faligndata(TMP4, TMP6, REF_2);

		vis_padd16(TMP8, TMP12, TMP8);

		vis_padd16(TMP10, TMP14, TMP10);
		vis_mul8x16au(REF_2,   CONST_256, TMP20);

		vis_padd16(TMP8, TMP16, TMP0);
		vis_mul8x16au(REF_2_1, CONST_256, TMP22);

		vis_padd16(TMP10, TMP18, TMP2);
		vis_pack16(TMP0, DST_0);

		vis_pack16(TMP2, DST_1);
		vis_st64(DST_0, dest[0]);
		dest += stride;
		/* Second output row: middle row plus the newly loaded row. */
		vis_padd16(TMP12, TMP20, TMP12);

		vis_padd16(TMP14, TMP22, TMP14);

		vis_padd16(TMP12, TMP24, TMP0);

		vis_padd16(TMP14, TMP26, TMP2);
		vis_pack16(TMP0, DST_2);

		vis_pack16(TMP2, DST_3);
		vis_st64(DST_2, dest[0]);
		dest += stride;
	} while (--height);
}

/*
 * MC_put_xy_16_vis: 16-byte-wide motion compensation that combines each
 * reference pixel with its right, lower and lower-right neighbours and
 * overwrites dest with the result (dest is never read).
 *
 * dest   - destination block; each row is written as two 64-bit stores
 *          (offsets 0 and 8)
 * _ref   - reference block; may be unaligned
 * stride - byte distance between consecutive rows, used for both ref and
 *          dest
 * height - number of rows to produce; two rows are emitted per pass and the
 *          counter is height >> 1 decremented in a do/while, so the value
 *          must be even and at least 2
 *
 * Three aligned doublewords (offsets 0, 8, 16) are loaded per reference
 * row, and reference rows 0..height are read.
 * NOTE(review): exact rounding depends on the GSR scale factor (5),
 * constants2 and constants256_512 defined elsewhere - confirm before
 * modifying.
 */
static void MC_put_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
			      const int stride, int height)
{
	uint8_t *ref = (uint8_t *) _ref;
	/* Byte misalignment of the reference, and the same shifted by one
	 * pixel for the right-hand neighbours. */
	unsigned long off = (unsigned long) ref & 0x7;
	unsigned long off_plus_1 = off + 1;
	int stride_8 = stride + 8;
	int stride_16 = stride + 16;

	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

	ref = vis_alignaddr(ref);

	vis_ld64(ref[ 0], TMP0);
	vis_fzero(ZERO);

	vis_ld64(ref[ 8], TMP2);

	vis_ld64(ref[16], TMP4);

	vis_ld64(constants2[0], CONST_2);
	/* REF_S0/REF_S4: row 0, left and right 8 bytes, at offset off. */
	vis_faligndata(TMP0, TMP2, REF_S0);

	vis_ld64(constants256_512[0], CONST_256);
	vis_faligndata(TMP2, TMP4, REF_S4);

	/* REF_S2/REF_S6: the same row shifted by one byte.  With off == 7
	 * the shifted data is exactly the following doubleword, so it is
	 * copied instead of realigned (presumably because an alignment of 8
	 * cannot be expressed - confirm against the VIS manual). */
	if (off != 0x7) {
		vis_alignaddr_g0((void *)off_plus_1);
		vis_faligndata(TMP0, TMP2, REF_S2);
		vis_faligndata(TMP2, TMP4, REF_S6);
	} else {
		vis_src1(TMP2, REF_S2);
		vis_src1(TMP4, REF_S6);
	}

	/* Two output rows are produced per iteration. */
	height >>= 1;
	do {
		/* Widen the carried-over row (REF_S*) to 16-bit lanes while
		 * the next two reference rows are being loaded. */
		vis_ld64_2(ref, stride, TMP0);
		vis_mul8x16au(REF_S0, CONST_256, TMP12);
		vis_pmerge(ZERO,      REF_S0_1,  TMP14);

		/* The alignment was left at off + 1 by the previous
		 * realignment; switch back to off for the unshifted data. */
		vis_alignaddr_g0((void *)off);

		vis_ld64_2(ref, stride_8, TMP2);
		vis_mul8x16au(REF_S2, CONST_256, TMP16);
		vis_pmerge(ZERO,      REF_S2_1,  TMP18);

		vis_ld64_2(ref, stride_16, TMP4);
		ref += stride;
		vis_mul8x16au(REF_S4, CONST_256, TMP20);
		vis_pmerge(ZERO,      REF_S4_1,  TMP22);

		vis_ld64_2(ref, stride, TMP6);
		vis_mul8x16au(REF_S6, CONST_256, TMP24);
		vis_pmerge(ZERO,      REF_S6_1,  TMP26);

		vis_ld64_2(ref, stride_8, TMP8);
		vis_faligndata(TMP0, TMP2, REF_0);

		vis_ld64_2(ref, stride_16, TMP10);
		ref += stride;
		vis_faligndata(TMP2, TMP4, REF_4);

		/* REF_S0/REF_S4 are reloaded with the row two below; they
		 * feed the second output row and the next iteration. */
		vis_faligndata(TMP6, TMP8, REF_S0);

		vis_faligndata(TMP8, TMP10, REF_S4);

		/* One-byte-shifted versions of both freshly loaded rows. */
		if (off != 0x7) {
			vis_alignaddr_g0((void *)off_plus_1);
			vis_faligndata(TMP0, TMP2, REF_2);
			vis_faligndata(TMP2, TMP4, REF_6);
			vis_faligndata(TMP6, TMP8, REF_S2);
			vis_faligndata(TMP8, TMP10, REF_S6);
		} else {
			vis_src1(TMP2, REF_2);
			vis_src1(TMP4, REF_6);
			vis_src1(TMP8, REF_S2);
			vis_src1(TMP10, REF_S6);
		}

		vis_mul8x16au(REF_0, CONST_256, TMP0);
		vis_pmerge(ZERO,      REF_0_1,  TMP2);

		vis_mul8x16au(REF_2, CONST_256, TMP4);
		vis_pmerge(ZERO,      REF_2_1,  TMP6);

		/* TMP8/TMP10 = middle row (x and x+1) plus CONST_2, left
		 * half; kept for reuse by the second output row. */
		vis_padd16(TMP0, CONST_2, TMP8);
		vis_mul8x16au(REF_4, CONST_256, TMP0);

		vis_padd16(TMP2, CONST_2, TMP10);
		vis_mul8x16au(REF_4_1, CONST_256, TMP2);

		vis_padd16(TMP8, TMP4, TMP8);
		vis_mul8x16au(REF_6, CONST_256, TMP4);

		vis_padd16(TMP10, TMP6, TMP10);
		vis_mul8x16au(REF_6_1, CONST_256, TMP6);

		vis_padd16(TMP12, TMP8, TMP12);

		vis_padd16(TMP14, TMP10, TMP14);

		vis_padd16(TMP12, TMP16, TMP12);

		vis_padd16(TMP14, TMP18, TMP14);
		vis_pack16(TMP12, DST_0);

		/* First output row, left 8 bytes. */
		vis_pack16(TMP14, DST_1);
		vis_st64(DST_0, dest[0]);
		/* TMP12/TMP14 = middle row plus CONST_2, right half. */
		vis_padd16(TMP0, CONST_2, TMP12);

		vis_mul8x16au(REF_S0, CONST_256, TMP0);
		vis_padd16(TMP2, CONST_2, TMP14);

		vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
		vis_padd16(TMP12, TMP4, TMP12);

		vis_mul8x16au(REF_S2, CONST_256, TMP4);
		vis_padd16(TMP14, TMP6, TMP14);

		vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
		vis_padd16(TMP20, TMP12, TMP20);

		vis_padd16(TMP22, TMP14, TMP22);

		vis_padd16(TMP20, TMP24, TMP20);

		vis_padd16(TMP22, TMP26, TMP22);
		vis_pack16(TMP20, DST_2);

		/* First output row, right 8 bytes. */
		vis_pack16(TMP22, DST_3);
		vis_st64_2(DST_2, dest, 8);
		dest += stride;
		vis_padd16(TMP0, TMP4, TMP24);

		vis_mul8x16au(REF_S4, CONST_256, TMP0);
		vis_padd16(TMP2, TMP6, TMP26);

		vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
		vis_padd16(TMP24, TMP8, TMP24);

		vis_padd16(TMP26, TMP10, TMP26);
		vis_pack16(TMP24, DST_0);

		/* Second output row, left 8 bytes. */
		vis_pack16(TMP26, DST_1);
		vis_st64(DST_0, dest[0]);
		vis_pmerge(ZERO, REF_S6, TMP4);

		vis_pmerge(ZERO,      REF_S6_1,  TMP6);

		vis_padd16(TMP0, TMP4, TMP0);

		vis_padd16(TMP2, TMP6, TMP2);

		vis_padd16(TMP0, TMP12, TMP0);

		vis_padd16(TMP2, TMP14, TMP2);
		vis_pack16(TMP0, DST_2);

		/* Second output row, right 8 bytes. */
		vis_pack16(TMP2, DST_3);
		vis_st64_2(DST_2, dest, 8);
		dest += stride;
	} while (--height);
}

/*
 * MC_put_xy_8_vis: 8-byte-wide motion compensation that combines each
 * reference pixel with its right, lower and lower-right neighbours and
 * overwrites dest with the result (dest is never read).
 *
 * dest   - destination block; one 64-bit store per row
 * _ref   - reference block; may be unaligned
 * stride - byte distance between consecutive rows, used for both ref and
 *          dest
 * height - number of rows to produce; two rows are emitted per pass and the
 *          counter is height >> 1 decremented in a do/while, so the value
 *          must be even and at least 2
 *
 * Two aligned doublewords (offsets 0 and 8) are loaded per reference row,
 * and reference rows 0..height are read.
 * NOTE(review): exact rounding depends on the GSR scale factor (5),
 * constants2 and constants256_512 defined elsewhere - confirm before
 * modifying.
 */
static void MC_put_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
			     const int stride, int height)
{
	uint8_t *ref = (uint8_t *) _ref;
	/* Byte misalignment of the reference, and the same shifted by one
	 * pixel for the right-hand neighbours. */
	unsigned long off = (unsigned long) ref & 0x7;
	unsigned long off_plus_1 = off + 1;
	int stride_8 = stride + 8;

	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

	ref = vis_alignaddr(ref);

	vis_ld64(ref[ 0], TMP0);
	vis_fzero(ZERO);

	vis_ld64(ref[ 8], TMP2);

	vis_ld64(constants2[0], CONST_2);

	vis_ld64(constants256_512[0], CONST_256);
	/* REF_S0: row 0 at offset off. */
	vis_faligndata(TMP0, TMP2, REF_S0);

	/* REF_S2: row 0 shifted by one byte.  With off == 7 the shifted data
	 * is exactly the second doubleword, so it is copied instead of
	 * realigned (presumably because an alignment of 8 cannot be
	 * expressed - confirm against the VIS manual). */
	if (off != 0x7) {
		vis_alignaddr_g0((void *)off_plus_1);
		vis_faligndata(TMP0, TMP2, REF_S2);
	} else {
		vis_src1(TMP2, REF_S2);
	}

	/* Two output rows are produced per iteration. */
	height >>= 1;
	do {	/* 26 cycles */
		/* Widen the carried-over row (REF_S0 / REF_S2) while the
		 * next two reference rows are loaded. */
		vis_ld64_2(ref, stride, TMP0);
		vis_mul8x16au(REF_S0,   CONST_256, TMP8);
		vis_pmerge(ZERO,        REF_S2,    TMP12);

		/* Alignment was left at off + 1; restore off for the
		 * unshifted realignments below. */
		vis_alignaddr_g0((void *)off);

		vis_ld64_2(ref, stride_8, TMP2);
		ref += stride;
		vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
		vis_pmerge(ZERO,        REF_S2_1,  TMP14);

		vis_ld64_2(ref, stride, TMP4);

		vis_ld64_2(ref, stride_8, TMP6);
		ref += stride;
		/* REF_S4: middle row, unshifted. */
		vis_faligndata(TMP0, TMP2, REF_S4);

		vis_pmerge(ZERO, REF_S4, TMP18);

		vis_pmerge(ZERO, REF_S4_1, TMP20);

		/* REF_S0 is reloaded with the row two below; it feeds the
		 * second output row and the next iteration. */
		vis_faligndata(TMP4, TMP6, REF_S0);

		/* One-byte-shifted versions of both freshly loaded rows. */
		if (off != 0x7) {
			vis_alignaddr_g0((void *)off_plus_1);
			vis_faligndata(TMP0, TMP2, REF_S6);
			vis_faligndata(TMP4, TMP6, REF_S2);
		} else {
			vis_src1(TMP2, REF_S6);
			vis_src1(TMP6, REF_S2);
		}

		/* TMP18/TMP20 accumulate middle row (x and x+1) plus CONST_2;
		 * this term is shared by both output rows. */
		vis_padd16(TMP18, CONST_2, TMP18);
		vis_mul8x16au(REF_S6,   CONST_256, TMP22);

		vis_padd16(TMP20, CONST_2, TMP20);
		vis_mul8x16au(REF_S6_1, CONST_256, TMP24);

		vis_mul8x16au(REF_S0,   CONST_256, TMP26);
		vis_pmerge(ZERO, REF_S0_1, TMP28);

		vis_mul8x16au(REF_S2,   CONST_256, TMP30);
		vis_padd16(TMP18, TMP22, TMP18);

		vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
		vis_padd16(TMP20, TMP24, TMP20);

		vis_padd16(TMP8,  TMP18, TMP8);

		vis_padd16(TMP10, TMP20, TMP10);

		vis_padd16(TMP8,  TMP12, TMP8);

		vis_padd16(TMP10, TMP14, TMP10);
		vis_pack16(TMP8,  DST_0);

		/* First output row. */
		vis_pack16(TMP10, DST_1);
		vis_st64(DST_0, dest[0]);
		dest += stride;
		vis_padd16(TMP18, TMP26, TMP18);

		vis_padd16(TMP20, TMP28, TMP20);

		vis_padd16(TMP18, TMP30, TMP18);

		vis_padd16(TMP20, TMP32, TMP20);
		vis_pack16(TMP18, DST_2);

		/* Second output row. */
		vis_pack16(TMP20, DST_3);
		vis_st64(DST_2, dest[0]);
		dest += stride;
	} while (--height);
}

/*
 * MC_avg_xy_16_vis: 16-byte-wide motion compensation that combines each
 * reference pixel with its right, lower and lower-right neighbours and
 * blends the result with the bytes already stored in dest.
 *
 * dest   - destination block; each row is read and rewritten as two 64-bit
 *          accesses (offsets 0 and 8)
 * _ref   - reference block; may be unaligned
 * stride - byte distance between consecutive rows, used for both ref and
 *          dest
 * height - number of rows to produce; two rows are emitted per pass and the
 *          counter is height >> 1 decremented in a do/while, so the value
 *          must be even and at least 2
 *
 * Three aligned doublewords (offsets 0, 8, 16) are loaded per reference
 * row, and reference rows 0..height are read.
 * NOTE(review): exact rounding/weighting depends on the GSR scale factor
 * (4), constants6 and constants256_1024 defined elsewhere - confirm before
 * modifying.
 */
static void MC_avg_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
			      const int stride, int height)
{
	uint8_t *ref = (uint8_t *) _ref;
	/* Byte misalignment of the reference, and the same shifted by one
	 * pixel for the right-hand neighbours. */
	unsigned long off = (unsigned long) ref & 0x7;
	unsigned long off_plus_1 = off + 1;
	int stride_8 = stride + 8;
	int stride_16 = stride + 16;

	vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

	ref = vis_alignaddr(ref);

	vis_ld64(ref[ 0], TMP0);
	vis_fzero(ZERO);

	vis_ld64(ref[ 8], TMP2);

	vis_ld64(ref[16], TMP4);

	vis_ld64(constants6[0], CONST_6);
	/* REF_S0/REF_S4: row 0, left and right 8 bytes, at offset off. */
	vis_faligndata(TMP0, TMP2, REF_S0);

	/* NOTE(review): CONST_1024 is used below but never loaded on its
	 * own; presumably it aliases the other half of this register -
	 * confirm. */
	vis_ld64(constants256_1024[0], CONST_256);
	vis_faligndata(TMP2, TMP4, REF_S4);

	/* REF_S2/REF_S6: the same row shifted by one byte.  With off == 7
	 * the shifted data is exactly the following doubleword, so it is
	 * copied instead of realigned (presumably because an alignment of 8
	 * cannot be expressed - confirm against the VIS manual). */
	if (off != 0x7) {
		vis_alignaddr_g0((void *)off_plus_1);
		vis_faligndata(TMP0, TMP2, REF_S2);
		vis_faligndata(TMP2, TMP4, REF_S6);
	} else {
		vis_src1(TMP2, REF_S2);
		vis_src1(TMP4, REF_S6);
	}

	/* Two output rows are produced per iteration. */
	height >>= 1;
	do {	/* 55 cycles */
		/* Widen the carried-over row (REF_S*) to 16-bit lanes while
		 * the next two reference rows are being loaded. */
		vis_ld64_2(ref, stride, TMP0);
		vis_mul8x16au(REF_S0, CONST_256, TMP12);
		vis_pmerge(ZERO,      REF_S0_1,  TMP14);

		/* Alignment was left at off + 1; restore off for the
		 * unshifted realignments below. */
		vis_alignaddr_g0((void *)off);

		vis_ld64_2(ref, stride_8, TMP2);
		vis_mul8x16au(REF_S2, CONST_256, TMP16);
		vis_pmerge(ZERO,      REF_S2_1,  TMP18);

		vis_ld64_2(ref, stride_16, TMP4);
		ref += stride;
		vis_mul8x16au(REF_S4, CONST_256, TMP20);
		vis_pmerge(ZERO,      REF_S4_1,  TMP22);

		vis_ld64_2(ref, stride, TMP6);
		vis_mul8x16au(REF_S6, CONST_256, TMP24);
		vis_pmerge(ZERO,      REF_S6_1,  TMP26);

		vis_ld64_2(ref, stride_8, TMP8);
		vis_faligndata(TMP0, TMP2, REF_0);

		vis_ld64_2(ref, stride_16, TMP10);
		ref += stride;
		vis_faligndata(TMP2, TMP4, REF_4);

		/* Existing destination bytes of the first output row. */
		vis_ld64(dest[0], DST_0);
		vis_faligndata(TMP6, TMP8, REF_S0);

		vis_ld64_2(dest, 8, DST_2);
		vis_faligndata(TMP8, TMP10, REF_S4);

		/* One-byte-shifted versions of both freshly loaded rows. */
		if (off != 0x7) {
			vis_alignaddr_g0((void *)off_plus_1);
			vis_faligndata(TMP0, TMP2, REF_2);
			vis_faligndata(TMP2, TMP4, REF_6);
			vis_faligndata(TMP6, TMP8, REF_S2);
			vis_faligndata(TMP8, TMP10, REF_S6);
		} else {
			vis_src1(TMP2, REF_2);
			vis_src1(TMP4, REF_6);
			vis_src1(TMP8, REF_S2);
			vis_src1(TMP10, REF_S6);
		}

		vis_mul8x16al(DST_0,   CONST_1024, TMP30);
		vis_pmerge(ZERO, REF_0, TMP0);

		vis_mul8x16al(DST_1,   CONST_1024, TMP32);
		vis_pmerge(ZERO,      REF_0_1,  TMP2);

		vis_mul8x16au(REF_2, CONST_256, TMP4);
		vis_pmerge(ZERO,      REF_2_1,  TMP6);

		/* From here on REF_0 and REF_2 no longer hold pixel data:
		 * their contents have been widened above, and the registers
		 * are reused as 16-bit accumulators.  Do not reorder. */
		vis_mul8x16al(DST_2,   CONST_1024, REF_0);
		vis_padd16(TMP0, CONST_6, TMP0);

		vis_mul8x16al(DST_3,   CONST_1024, REF_2);
		vis_padd16(TMP2, CONST_6, TMP2);

		vis_padd16(TMP0, TMP4, TMP0);
		vis_mul8x16au(REF_4, CONST_256, TMP4);

		vis_padd16(TMP2, TMP6, TMP2);
		vis_mul8x16au(REF_4_1, CONST_256, TMP6);

		vis_padd16(TMP12, TMP0, TMP12);
		vis_mul8x16au(REF_6, CONST_256, TMP8);

		vis_padd16(TMP14, TMP2, TMP14);
		vis_mul8x16au(REF_6_1, CONST_256, TMP10);

		/* REF_4 and REF_6 are likewise recycled as accumulators once
		 * their pixel data has been consumed just above. */
		vis_padd16(TMP12, TMP16, TMP12);
		vis_mul8x16au(REF_S0, CONST_256, REF_4);

		vis_padd16(TMP14, TMP18, TMP14);
		vis_mul8x16au(REF_S0_1, CONST_256, REF_6);

		vis_padd16(TMP12, TMP30, TMP12);

		vis_padd16(TMP14, TMP32, TMP14);
		vis_pack16(TMP12, DST_0);

		/* First output row, left 8 bytes. */
		vis_pack16(TMP14, DST_1);
		vis_st64(DST_0, dest[0]);
		vis_padd16(TMP4, CONST_6, TMP4);

		/* Prefetch the existing destination bytes of the second
		 * output row (left half). */
		vis_ld64_2(dest, stride, DST_0);
		vis_padd16(TMP6, CONST_6, TMP6);
		vis_mul8x16au(REF_S2, CONST_256, TMP12);

		vis_padd16(TMP4, TMP8, TMP4);
		vis_mul8x16au(REF_S2_1, CONST_256,  TMP14);

		vis_padd16(TMP6, TMP10, TMP6);

		vis_padd16(TMP20, TMP4, TMP20);

		vis_padd16(TMP22, TMP6, TMP22);

		vis_padd16(TMP20, TMP24, TMP20);

		vis_padd16(TMP22, TMP26, TMP22);

		vis_padd16(TMP20, REF_0, TMP20);
		vis_mul8x16au(REF_S4, CONST_256, REF_0);

		vis_padd16(TMP22, REF_2, TMP22);
		vis_pack16(TMP20, DST_2);

		/* First output row, right 8 bytes. */
		vis_pack16(TMP22, DST_3);
		vis_st64_2(DST_2, dest, 8);
		dest += stride;

		/* Existing destination bytes of the second output row
		 * (right half). */
		vis_ld64_2(dest, 8, DST_2);
		vis_mul8x16al(DST_0,   CONST_1024, TMP30);
		vis_pmerge(ZERO,      REF_S4_1,  REF_2);

		vis_mul8x16al(DST_1,   CONST_1024, TMP32);
		vis_padd16(REF_4, TMP0, TMP8);

		vis_mul8x16au(REF_S6, CONST_256, REF_4);
		vis_padd16(REF_6, TMP2, TMP10);

		vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
		vis_padd16(TMP8, TMP12, TMP8);

		vis_padd16(TMP10, TMP14, TMP10);

		vis_padd16(TMP8, TMP30, TMP8);

		vis_padd16(TMP10, TMP32, TMP10);
		vis_pack16(TMP8, DST_0);

		/* Second output row, left 8 bytes. */
		vis_pack16(TMP10, DST_1);
		vis_st64(DST_0, dest[0]);

		vis_padd16(REF_0, TMP4, REF_0);

		vis_mul8x16al(DST_2,   CONST_1024, TMP30);
		vis_padd16(REF_2, TMP6, REF_2);

		vis_mul8x16al(DST_3,   CONST_1024, TMP32);
		vis_padd16(REF_0, REF_4, REF_0);

		vis_padd16(REF_2, REF_6, REF_2);

		vis_padd16(REF_0, TMP30, REF_0);

		/* stall */

		vis_padd16(REF_2, TMP32, REF_2);
		vis_pack16(REF_0, DST_2);

		/* Second output row, right 8 bytes. */
		vis_pack16(REF_2, DST_3);
		vis_st64_2(DST_2, dest, 8);
		dest += stride;
	} while (--height);
}

/*
 * MC_avg_xy_8_vis: 8-byte-wide motion compensation that combines each
 * reference pixel with its right, lower and lower-right neighbours and
 * blends the result with the bytes already stored in dest.
 *
 * dest   - destination block; each row is read and rewritten with one
 *          64-bit access
 * _ref   - reference block; may be unaligned
 * stride - byte distance between consecutive rows, used for both ref and
 *          dest
 * height - number of rows to produce; two rows are emitted per pass and the
 *          counter is height >> 1 decremented in a do/while, so the value
 *          must be even and at least 2
 *
 * Two aligned doublewords (offsets 0 and 8) are loaded per reference row,
 * and reference rows 0..height are read.
 * NOTE(review): exact rounding/weighting depends on the GSR scale factor
 * (4), constants6 and constants256_1024 defined elsewhere - confirm before
 * modifying.
 */
static void MC_avg_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
			     const int stride, int height)
{
	uint8_t *ref = (uint8_t *) _ref;
	/* Byte misalignment of the reference, and the same shifted by one
	 * pixel for the right-hand neighbours. */
	unsigned long off = (unsigned long) ref & 0x7;
	unsigned long off_plus_1 = off + 1;
	int stride_8 = stride + 8;

	vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

	ref = vis_alignaddr(ref);

	vis_ld64(ref[0], TMP0);
	vis_fzero(ZERO);

	vis_ld64_2(ref, 8, TMP2);

	vis_ld64(constants6[0], CONST_6);

	/* NOTE(review): CONST_1024 is used below but never loaded on its
	 * own; presumably it aliases the other half of this register -
	 * confirm. */
	vis_ld64(constants256_1024[0], CONST_256);
	/* REF_S0: row 0 at offset off. */
	vis_faligndata(TMP0, TMP2, REF_S0);

	/* REF_S2: row 0 shifted by one byte.  With off == 7 the shifted data
	 * is exactly the second doubleword, so it is copied instead of
	 * realigned (presumably because an alignment of 8 cannot be
	 * expressed - confirm against the VIS manual). */
	if (off != 0x7) {
		vis_alignaddr_g0((void *)off_plus_1);
		vis_faligndata(TMP0, TMP2, REF_S2);
	} else {
		vis_src1(TMP2, REF_S2);
	}

	/* Two output rows are produced per iteration. */
	height >>= 1;
	do {	/* 31 cycles */
		/* Widen the carried-over row (REF_S0 / REF_S2) while the
		 * next two reference rows are loaded. */
		vis_ld64_2(ref, stride, TMP0);
		vis_mul8x16au(REF_S0, CONST_256, TMP8);
		vis_pmerge(ZERO,      REF_S0_1,  TMP10);

		vis_ld64_2(ref, stride_8, TMP2);
		ref += stride;
		vis_mul8x16au(REF_S2, CONST_256, TMP12);
		vis_pmerge(ZERO,      REF_S2_1,  TMP14);

		/* Alignment was left at off + 1; restore off for the
		 * unshifted realignments below. */
		vis_alignaddr_g0((void *)off);

		vis_ld64_2(ref, stride, TMP4);
		vis_faligndata(TMP0, TMP2, REF_S4);

		vis_ld64_2(ref, stride_8, TMP6);
		ref += stride;

		/* Existing destination bytes of both output rows. */
		vis_ld64(dest[0], DST_0);
		vis_faligndata(TMP4, TMP6, REF_S0);

		vis_ld64_2(dest, stride, DST_2);

		/* One-byte-shifted versions of both freshly loaded rows. */
		if (off != 0x7) {
			vis_alignaddr_g0((void *)off_plus_1);
			vis_faligndata(TMP0, TMP2, REF_S6);
			vis_faligndata(TMP4, TMP6, REF_S2);
		} else {
			vis_src1(TMP2, REF_S6);
			vis_src1(TMP6, REF_S2);
		}

		vis_mul8x16al(DST_0,   CONST_1024, TMP30);
		vis_pmerge(ZERO, REF_S4, TMP22);

		vis_mul8x16al(DST_1,   CONST_1024, TMP32);
		vis_pmerge(ZERO,      REF_S4_1,  TMP24);

		vis_mul8x16au(REF_S6, CONST_256, TMP26);
		vis_pmerge(ZERO,      REF_S6_1,  TMP28);

		/* REF_S4 and REF_S6 have been widened above; the registers
		 * are reused as 16-bit accumulators from here on, as are
		 * REF_0 and REF_2 below.  Do not reorder. */
		vis_mul8x16au(REF_S0, CONST_256, REF_S4);
		vis_padd16(TMP22, CONST_6, TMP22);

		vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
		vis_padd16(TMP24, CONST_6, TMP24);

		vis_mul8x16al(DST_2,   CONST_1024, REF_0);
		vis_padd16(TMP22, TMP26, TMP22);

		vis_mul8x16al(DST_3,   CONST_1024, REF_2);
		vis_padd16(TMP24, TMP28, TMP24);

		/* TMP22/TMP24 now hold middle row (x and x+1) plus CONST_6;
		 * this term is shared by both output rows. */
		vis_mul8x16au(REF_S2, CONST_256, TMP26);
		vis_padd16(TMP8, TMP22, TMP8);

		vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
		vis_padd16(TMP10, TMP24, TMP10);

		vis_padd16(TMP8, TMP12, TMP8);

		vis_padd16(TMP10, TMP14, TMP10);

		vis_padd16(TMP8, TMP30, TMP8);

		vis_padd16(TMP10, TMP32, TMP10);
		vis_pack16(TMP8, DST_0);

		/* First output row. */
		vis_pack16(TMP10, DST_1);
		vis_st64(DST_0, dest[0]);
		dest += stride;

		vis_padd16(REF_S4, TMP22, TMP12);

		vis_padd16(REF_S6, TMP24, TMP14);

		vis_padd16(TMP12, TMP26, TMP12);

		vis_padd16(TMP14, TMP28, TMP14);

		vis_padd16(TMP12, REF_0, TMP12);

		vis_padd16(TMP14, REF_2, TMP14);
		vis_pack16(TMP12, DST_2);

		/* Second output row. */
		vis_pack16(TMP14, DST_3);
		vis_st64(DST_2, dest[0]);
		dest += stride;
	} while (--height);
}

/* NOTE(review): MPEG2_MC_EXTERN is defined outside this file; it presumably
 * emits the "vis" function table that exposes the static MC_*_vis routines
 * above to the rest of the decoder - confirm in mpeg2_internal.h. */
MPEG2_MC_EXTERN(vis);

#endif  /* ARCH_SPARC */