changeset 209:c0d8ecae7ac5 libavcodec

(commit by michael) faster simple idct in MMX
author arpi_esp
date Thu, 17 Jan 2002 20:00:41 +0000
parents 2eb04d6be309
children c2b6d68a0671
files dsputil.c i386/simple_idct_mmx.c
diffstat 2 files changed, 970 insertions(+), 1142 deletions(-)
--- a/dsputil.c	Tue Jan 15 22:22:41 2002 +0000
+++ b/dsputil.c	Thu Jan 17 20:00:41 2002 +0000
@@ -71,16 +71,16 @@
     38, 46, 54, 62, 39, 47, 55, 63,
 };
 
-
+/* Input permutation for the simple_idct_mmx */
 static UINT8 simple_mmx_permutation[64]={
-	0x00, 0x08, 0x01, 0x09, 0x04, 0x0C, 0x05, 0x0D,
-	0x10, 0x18, 0x11, 0x19, 0x14, 0x1C, 0x15, 0x1D,
-	0x02, 0x0A, 0x03, 0x0B, 0x06, 0x0E, 0x07, 0x0F,
-	0x12, 0x1A, 0x13, 0x1B, 0x16, 0x1E, 0x17, 0x1F,
-	0x20, 0x28, 0x21, 0x29, 0x24, 0x2C, 0x25, 0x2D,
-	0x30, 0x38, 0x31, 0x39, 0x34, 0x3C, 0x35, 0x3D,
-	0x22, 0x2A, 0x23, 0x2B, 0x26, 0x2E, 0x27, 0x2F,
-	0x32, 0x3A, 0x33, 0x3B, 0x36, 0x3E, 0x37, 0x3F,
+	0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 
+	0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 
+	0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 
+	0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, 
+	0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 
+	0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, 
+	0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 
+	0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 };
 
 /* used to skip zeros at the end */
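
The simple_mmx_permutation table above changed together with the IDCT rewrite: the new row macros read each source quadword as an (x0,x4)/(x2,x6) pair of two interleaved rows (the register comments in simple_idct_mmx.c change from "R2 R0 r2 r0" to "R4 R0 r4 r0"), so dsputil has to place the decoded coefficients into that layout up front. A minimal sketch of applying a 64-entry index permutation to a coefficient block; the helper name and the scatter direction are illustrative assumptions, since the code that consumes this table is outside this hunk.

    #include <stdint.h>

    /* Illustrative only: put each coefficient where the MMX IDCT expects it.
       Whether the table is meant as a scatter (as here) or a gather follows
       from how dsputil.c applies it, which is not shown in this hunk. */
    static void permute_block(int16_t dst[64], const int16_t src[64],
                              const uint8_t perm[64])
    {
        int i;
        for (i = 0; i < 64; i++)
            dst[perm[i]] = src[i];
    }
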
--- a/i386/simple_idct_mmx.c	Tue Jan 15 22:22:41 2002 +0000
+++ b/i386/simple_idct_mmx.c	Thu Jan 17 20:00:41 2002 +0000
@@ -1,5 +1,5 @@
 /*
-    Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
+    Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -43,26 +43,30 @@
 //	0, 0, 0, 0,
 //	0, 0, 0, 0,
 
-	 C4,  C2,  C4,  C2,
-	 C4,  C6,  C4,  C6,
-	 C1,  C3,  C1,  C3,
-	 C5,  C7,  C5,  C7,
-
-	 C4,  C6,  C4,  C6,
-	-C4, -C2, -C4, -C2,
-	 C3, -C7,  C3, -C7,
-	-C1, -C5, -C1, -C5,
+ C4,  C4,  C4,  C4,
+ C4, -C4,  C4, -C4,
+ 
+ C2,  C6,  C2,  C6,
+ C6, -C2,  C6, -C2,
+ 
+ C1,  C3,  C1,  C3,
+ C5,  C7,  C5,  C7,
+ 
+ C3, -C7,  C3, -C7,
+-C1, -C5, -C1, -C5,
+ 
+ C5, -C1,  C5, -C1,
+ C7,  C3,  C7,  C3,
+ 
+ C7, -C5,  C7, -C5,
+ C3, -C1,  C3, -C1
+};
 
-	 C4, -C6,  C4, -C6,
-	-C4,  C2, -C4,  C2,
-	 C5, -C1,  C5, -C1,
-	 C7,  C3,  C7,  C3,
+static void unused_var_killer(){
+	int a= wm1010 + d40000;
+	temp[0]=a;
+}
 
-	 C4, -C2,  C4, -C2,
-	 C4, -C6,  C4, -C6,
-	 C7, -C5,  C7, -C5,
-	 C3, -C1,  C3, -C1
-	};
 #if 0
 static void inline idctCol (int16_t * col, int16_t *input)
 {
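
The coefficient table is the heart of the speedup: the even-part quadwords are now stored as sign pairs, (C4,C4) followed by (C4,-C4) and (C2,C6) followed by (C6,-C2), so two pmaddwd on the same permuted source register yield both the sum and the difference half of a butterfly, and the A0..A3 terms in the macros below are then formed with plain paddd/psubd instead of additional multiplies; the odd part keeps one coefficient pair per B term. A scalar model of pmaddwd, to make the register comments easier to follow (the function name is illustrative):

    #include <stdint.h>

    /* pmaddwd model: multiply four signed 16-bit pairs and add adjacent
       products, giving two signed 32-bit results. */
    static void pmaddwd_model(int32_t d[2], const int16_t a[4], const int16_t b[4])
    {
        d[0] = (int32_t)a[0]*b[0] + (int32_t)a[1]*b[1];
        d[1] = (int32_t)a[2]*b[2] + (int32_t)a[3]*b[3];
    }

    /* With a = (x0, x4, x0', x4') and b = (C4, C4, C4, C4), both results are
       C4*x0 + C4*x4 (one per row); with the sign-flipped quadword
       (C4, -C4, C4, -C4) they become C4*x0 - C4*x4, i.e. the even-part
       butterfly for two rows at once. */
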
@@ -188,107 +192,157 @@
 
 static inline void idct(int16_t *block)
 {
-	int i;
-//for(i=0; i<64; i++) temp[i]= block[ block_permute_op(i) ];
-//for(i=0; i<64; i++) temp[block_permute_op(i)]= block[ i ];
-//for(i=0; i<64; i++) block[i]= temp[i];
-//block_permute(block);
-/*
-idctRow(temp, block);
-idctRow(temp+16, block+16);
-idctRow(temp+1, block+2);
-idctRow(temp+17, block+18);
-idctRow(temp+32, block+32);
-idctRow(temp+48, block+48);
-idctRow(temp+33, block+34);
-idctRow(temp+49, block+50);
-*/
+	asm volatile(
+#if 0 //Alternative, simpler variant
 
-	asm volatile(
-//		"lea 64(%0), %%eax		\n\t"
-//r0,r2,R0,R2	r4,r6,R4,R6	r1,r3,R1,R3	r5,r7,R5,R7
-//src0		src4		src1		src5
-//r0,R0,r7,R7	r1,R1,r6,R6	r2,R2,r5,R5	r3,R3,r4,R4
-//dst0		dst1		dst2		dst3
-#if 0 //Alternative, simpler variant
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq " #src4 ", %%mm1			\n\t" /* R6	R4	r6	r4 */\
+#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
 	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
 	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 24(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C4R4	C6r6+C4r4 */\
-	"movq 32(%2), %%mm6			\n\t" /* C3	C1	C3	C1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
-	"movq 40(%2), %%mm7			\n\t" /* C7	C5	C7	C5 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
+	#rounder ", %%mm4			\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
 	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq 56(%2), %%mm5			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	#rounder ", %%mm0			\n\t"\
+	"paddd %%mm0, %%mm1			\n\t" /* A1		a1 */\
+	"paddd %%mm0, %%mm0			\n\t" \
+	"psubd %%mm1, %%mm0			\n\t" /* A2		a2 */\
+	"pmaddwd 64(%2), %%mm2			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
+	"paddd %%mm5, %%mm7			\n\t" /* B0		b0 */\
+	"movq 72(%2), %%mm5			\n\t" /* -C5	-C1	-C5	-C1 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
+	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
+	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
+	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"paddd %%mm2, %%mm5			\n\t" /* B1		b1 */\
+	"psrad $" #shift ", %%mm7		\n\t"\
+	"psrad $" #shift ", %%mm4		\n\t"\
+	"movq %%mm1, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm5, %%mm1			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm5, %%mm2			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm1, %%mm7			\n\t" /* A1+B1	a1+b1	A0+B0	a0+b0 */\
+	"packssdw %%mm4, %%mm2			\n\t" /* A0-B0	a0-b0	A1-B1	a1-b1 */\
+	"movq %%mm7, " #dst "			\n\t"\
+	"movq " #src1 ", %%mm1			\n\t" /* R3	R1	r3	r1 */\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"movq %%mm2, 24+" #dst "		\n\t"\
+	"pmaddwd %%mm1, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"movq 88(%2), %%mm7			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd 96(%2), %%mm1			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
+	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
+	"movq %%mm0, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm7, %%mm4			\n\t" /* B2		b2 */\
+	"paddd %%mm4, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm0			\n\t" /* a2-B2		a2-b2 */\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"psrad $" #shift ", %%mm0		\n\t"\
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm1, %%mm3			\n\t" /* B3		b3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm4			\n\t" /* a3-B3		a3-b3 */\
+	"psrad $" #shift ", %%mm6		\n\t"\
+	"packssdw %%mm6, %%mm2			\n\t" /* A3+B3	a3+b3	A2+B2	a2+b2 */\
+	"movq %%mm2, 8+" #dst "			\n\t"\
+	"psrad $" #shift ", %%mm4		\n\t"\
+	"packssdw %%mm0, %%mm4			\n\t" /* A2-B2	a2-b2	A3-B3	a3-b3 */\
+	"movq %%mm4, 16+" #dst "		\n\t"\
+
+#define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
+	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
+	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
 	#rounder ", %%mm4			\n\t"\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B0		b0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A0+B0		a0+b0 */\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	#rounder ", %%mm0			\n\t"\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
+	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq %%mm0, %%mm5			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"paddd %%mm1, %%mm0			\n\t" /* A1		a1 */\
+	"psubd %%mm1, %%mm5			\n\t" /* A2		a2 */\
+	"movq 56(%2), %%mm1			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	"pmaddwd 64(%2), %%mm2			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
+	"paddd %%mm1, %%mm7			\n\t" /* B0		b0 */\
+	"movq 72(%2), %%mm1			\n\t" /* -C5	-C1	-C5	-C1 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
+	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
 	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"paddd %%mm2, %%mm1			\n\t" /* B1		b1 */\
+	"psrad $" #shift ", %%mm7		\n\t"\
+	"psrad $" #shift ", %%mm4		\n\t"\
+	"movq %%mm0, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm1, %%mm0			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm1, %%mm2			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm0		\n\t"\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm7, %%mm7			\n\t" /* A0+B0	a0+b0 */\
+	"movd %%mm7, " #dst "			\n\t"\
+	"packssdw %%mm0, %%mm0			\n\t" /* A1+B1	a1+b1 */\
+	"movd %%mm0, 16+" #dst "		\n\t"\
+	"packssdw %%mm2, %%mm2			\n\t" /* A1-B1	a1-b1 */\
+	"movd %%mm2, 96+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A0-B0	a0-b0 */\
+	"movd %%mm4, 112+" #dst "		\n\t"\
+	"movq " #src1 ", %%mm0			\n\t" /* R3	R1	r3	r1 */\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"movq 88(%2), %%mm7			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd 96(%2), %%mm0			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
+	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
+	"movq %%mm5, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm7, %%mm4			\n\t" /* B2		b2 */\
+	"paddd %%mm4, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm5			\n\t" /* a2-B2		a2-b2 */\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"psrad $" #shift ", %%mm5		\n\t"\
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm0, %%mm3			\n\t" /* B3		b3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm4			\n\t" /* a3-B3		a3-b3 */\
 	"psrad $" #shift ", %%mm6		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0(%%mm6, %%mm4, dst) \
-\
-	"movq 56(%2), %%mm4			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* -C2R6-C4R4	-C2r6-C4r4 */\
-	"movq 64(%2), %%mm6			\n\t" /* -C7	C3	-C7	C3 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
-	"movq 72(%2), %%mm7			\n\t" /* -C5	-C1	-C5	-C1 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A1		a1 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"movq 80(%2), %%mm5			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE1(%%mm6, %%mm4, dst, %%mm7) \
-\
-	"movq 88(%2), %%mm4			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* C2R6-C4R4	C2r6-C4r4 */\
-	"movq 96(%2), %%mm6			\n\t" /* -C1	C5	-C1	C5 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
-	"movq 104(%2), %%mm7			\n\t" /* C3	C7	C3	C7 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A2		a2 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"pmaddwd 120(%2), %%mm1			\n\t" /* -C6R6+C4R4	-C6r6+C4r4 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 128(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
-	"pmaddwd 136(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm1, %%mm0			\n\t" /* A3		a3 */\
-	#rounder ", %%mm0			\n\t"\
-	"paddd %%mm3, %%mm2			\n\t" /* B3		b3 */\
-	"paddd %%mm0, %%mm2			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm2, %%mm0			\n\t" /* A3-B3		a3-b3 */\
-	"psrad $" #shift ", %%mm2		\n\t"\
-	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)
+	"packssdw %%mm2, %%mm2			\n\t" /* A2+B2	a2+b2 */\
+	"packssdw %%mm6, %%mm6			\n\t" /* A3+B3	a3+b3 */\
+	"movd %%mm2, 32+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A3-B3	a3-b3 */\
+	"packssdw %%mm5, %%mm5			\n\t" /* A2-B2	a2-b2 */\
+	"movd %%mm6, 48+" #dst "		\n\t"\
+	"movd %%mm4, 64+" #dst "		\n\t"\
+	"movd %%mm5, 80+" #dst "		\n\t"\
 
-#define DC_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq " #src4 ", %%mm1			\n\t" /* R6	R4	r6	r4 */\
+	
+#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
 	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
 	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
 	"movq wm1010, %%mm4			\n\t"\
@@ -300,231 +354,103 @@
 	"movd %%mm4, %%eax			\n\t"\
 	"orl %%eax, %%eax			\n\t"\
 	"jz 1f					\n\t"\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 24(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C4R4	C6r6+C4r4 */\
-	"movq 32(%2), %%mm6			\n\t" /* C3	C1	C3	C1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
-	"movq 40(%2), %%mm7			\n\t" /* C7	C5	C7	C5 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B0		b0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A0+B0		a0+b0 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A0-B0		a0-b0 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0(%%mm6, %%mm4, dst) \
-\
-	"movq 56(%2), %%mm4			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* -C2R6-C4R4	-C2r6-C4r4 */\
-	"movq 64(%2), %%mm6			\n\t" /* -C7	C3	-C7	C3 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
-	"movq 72(%2), %%mm7			\n\t" /* -C5	-C1	-C5	-C1 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A1		a1 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
 	#rounder ", %%mm4			\n\t"\
-\
-	"movq 80(%2), %%mm5			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq 56(%2), %%mm5			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	#rounder ", %%mm0			\n\t"\
+	"paddd %%mm0, %%mm1			\n\t" /* A1		a1 */\
+	"paddd %%mm0, %%mm0			\n\t" \
+	"psubd %%mm1, %%mm0			\n\t" /* A2		a2 */\
+	"pmaddwd 64(%2), %%mm2			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
+	"paddd %%mm5, %%mm7			\n\t" /* B0		b0 */\
+	"movq 72(%2), %%mm5			\n\t" /* -C5	-C1	-C5	-C1 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
+	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
+	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
+	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"paddd %%mm2, %%mm5			\n\t" /* B1		b1 */\
+	"psrad $" #shift ", %%mm7		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE1(%%mm6, %%mm4, dst, %%mm7) \
-\
-	"movq 88(%2), %%mm4			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* C2R6-C4R4	C2r6-C4r4 */\
-	"movq 96(%2), %%mm6			\n\t" /* -C1	C5	-C1	C5 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
-	"movq 104(%2), %%mm7			\n\t" /* C3	C7	C3	C7 */\
+	"movq %%mm1, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm5, %%mm1			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm5, %%mm2			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm1, %%mm7			\n\t" /* A1+B1	a1+b1	A0+B0	a0+b0 */\
+	"packssdw %%mm4, %%mm2			\n\t" /* A0-B0	a0-b0	A1-B1	a1-b1 */\
+	"movq %%mm7, " #dst "			\n\t"\
+	"movq " #src1 ", %%mm1			\n\t" /* R3	R1	r3	r1 */\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"movq %%mm2, 24+" #dst "		\n\t"\
+	"pmaddwd %%mm1, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"movq 88(%2), %%mm7			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd 96(%2), %%mm1			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
 	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A2		a2 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"pmaddwd 120(%2), %%mm1			\n\t" /* -C6R6+C4R4	-C6r6+C4r4 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 128(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
-	"pmaddwd 136(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm1, %%mm0			\n\t" /* A3		a3 */\
-	#rounder ", %%mm0			\n\t"\
-	"paddd %%mm3, %%mm2			\n\t" /* B3		b3 */\
-	"paddd %%mm0, %%mm2			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm2, %%mm0			\n\t" /* A3-B3		a3-b3 */\
+	"movq %%mm0, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm7, %%mm4			\n\t" /* B2		b2 */\
+	"paddd %%mm4, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm0			\n\t" /* a2-B2		a2-b2 */\
 	"psrad $" #shift ", %%mm2		\n\t"\
 	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm1, %%mm3			\n\t" /* B3		b3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm4			\n\t" /* a3-B3		a3-b3 */\
+	"psrad $" #shift ", %%mm6		\n\t"\
+	"packssdw %%mm6, %%mm2			\n\t" /* A3+B3	a3+b3	A2+B2	a2+b2 */\
+	"movq %%mm2, 8+" #dst "			\n\t"\
+	"psrad $" #shift ", %%mm4		\n\t"\
+	"packssdw %%mm0, %%mm4			\n\t" /* A2-B2	a2-b2	A3-B3	a3-b3 */\
+	"movq %%mm4, 16+" #dst "		\n\t"\
 	"jmp 2f					\n\t"\
 	"1:					\n\t"\
-	WRITE3(%%mm0, dst)\
-	"2:					\n\t"\
+	"pslld $16, %%mm0			\n\t"\
+	"#paddd d40000, %%mm0			\n\t"\
+	"psrad $13, %%mm0			\n\t"\
+	"packssdw %%mm0, %%mm0			\n\t"\
+	"movq %%mm0, " #dst "			\n\t"\
+	"movq %%mm0, 8+" #dst "			\n\t"\
+	"movq %%mm0, 16+" #dst "		\n\t"\
+	"movq %%mm0, 24+" #dst "		\n\t"\
+	"2:					\n\t"
 
 
-#define WRITE0(s0, s7, dst)\
-	"movq " #s0 ", " #dst "			\n\t" /* R0		r0 */\
-	"movq " #s7 ", 24+" #dst "		\n\t" /* R7		r7 */
-
-#define WRITE1(s1, s6, dst, tmp)\
-	"movq " #dst ", " #tmp "		\n\t" /* R0		r0 */\
-	"packssdw " #s1 ", " #tmp "		\n\t" /* R1	r1	R0	r0*/\
-	"movq " #tmp ", " #dst "		\n\t"\
-	"movq 24+" #dst ", " #tmp "		\n\t" /* R7		r7 */\
-	"packssdw " #tmp ", " #s6 "		\n\t" /* R7	r7	R6	r6*/\
-	"movq " #s6 ", 24+" #dst "		\n\t"
-
-#define WRITE2(s2, s5, s3, s4, dst)\
-	"packssdw " #s3 ", " #s2 "		\n\t" /* R3	r3	R2	r2*/\
-	"packssdw " #s5 ", " #s4 "		\n\t" /* R5	r5	R4	r4*/\
-	"movq " #s2 ", 8+" #dst "		\n\t"\
-	"movq " #s4 ", 16+" #dst "		\n\t"
-
-#define WRITE3(a, dst)\
-	"pslld $16, " #a "			\n\t"\
-	"psrad $13, " #a "			\n\t"\
-	"packssdw " #a ", " #a "		\n\t"\
-	"movq " #a ", " #dst "			\n\t"\
-	"movq " #a ", 8+" #dst "		\n\t"\
-	"movq " #a ", 16+" #dst "		\n\t"\
-	"movq " #a ", 24+" #dst "		\n\t"\
+//IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
+ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
+/*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
+ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
+ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
 
-//IDCT_CORE(          src0,   src4,   src1,   src5,    dst,   rounder, shift)
-IDCT_CORE(            (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
-/*
-DC_COND_IDCT_CORE(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
-DC_COND_IDCT_CORE(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
-DC_COND_IDCT_CORE(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
-*/
-IDCT_CORE(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
-IDCT_CORE(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
-IDCT_CORE(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
+DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
+DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
+DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
 
-#undef WRITE0
-#undef WRITE1
-#undef WRITE2
-
-#define WRITE0(s0, s7, dst)\
-	"packssdw " #s0 ", " #s0 "		\n\t" /* C0, c0, C0, c0 */\
-	"packssdw " #s7 ", " #s7 "		\n\t" /* C7, c7, C7, c7 */\
-	"movd " #s0 ", " #dst "			\n\t" /* C0, c0 */\
-	"movd " #s7 ", 112+" #dst "		\n\t" /* C7, c7 */
 
-#define WRITE1(s1, s6, dst, tmp)\
-	"packssdw " #s1 ", " #s1 "		\n\t" /* C1, c1, C1, c1 */\
-	"packssdw " #s6 ", " #s6 "		\n\t" /* C6, c6, C6, c6 */\
-	"movd " #s1 ", 16+" #dst "		\n\t" /* C1, c1 */\
-	"movd " #s6 ", 96+" #dst "		\n\t" /* C6, c6 */
-
-#define WRITE2(s2, s5, s3, s4, dst)\
-	"packssdw " #s2 ", " #s2 "		\n\t" /* C2, c2, C2, c2 */\
-	"packssdw " #s3 ", " #s3 "		\n\t" /* C3, c3, C3, c3 */\
-	"movd " #s2 ", 32+" #dst "		\n\t" /* C2, c2 */\
-	"movd " #s3 ", 48+" #dst "		\n\t" /* C3, c3 */\
-	"packssdw " #s4 ", " #s4 "		\n\t" /* C4, c4, C4, c4 */\
-	"packssdw " #s5 ", " #s5 "		\n\t" /* C5, c5, C5, c5 */\
-	"movd " #s4 ", 64+" #dst "		\n\t" /* C4, c4 */\
-	"movd " #s5 ", 80+" #dst "		\n\t" /* C5, c5 */\
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+//IDCT(      src0,   src4,   src1,    src5,    dst, rounder, shift)
+COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 
 #else
 
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq " #src4 ", %%mm1			\n\t" /* R6	R4	r6	r4 */\
-	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
-	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 24(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C4R4	C6r6+C4r4 */\
-	"movq 32(%2), %%mm6			\n\t" /* C3	C1	C3	C1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
-	"movq 40(%2), %%mm7			\n\t" /* C7	C5	C7	C5 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B0		b0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A0+B0		a0+b0 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A0-B0		a0-b0 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0(%%mm6, %%mm4, dst) \
-\
-	"movq 56(%2), %%mm4			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* -C2R6-C4R4	-C2r6-C4r4 */\
-	"movq 64(%2), %%mm6			\n\t" /* -C7	C3	-C7	C3 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
-	"movq 72(%2), %%mm7			\n\t" /* -C5	-C1	-C5	-C1 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A1		a1 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"movq 80(%2), %%mm5			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE1(%%mm6, %%mm4, dst, %%mm7) \
-\
-	"movq 88(%2), %%mm4			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* C2R6-C4R4	C2r6-C4r4 */\
-	"movq 96(%2), %%mm6			\n\t" /* -C1	C5	-C1	C5 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
-	"movq 104(%2), %%mm7			\n\t" /* C3	C7	C3	C7 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A2		a2 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"pmaddwd 120(%2), %%mm1			\n\t" /* -C6R6+C4R4	-C6r6+C4r4 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 128(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
-	"pmaddwd 136(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm1, %%mm0			\n\t" /* A3		a3 */\
-	#rounder ", %%mm0			\n\t"\
-	"paddd %%mm3, %%mm2			\n\t" /* B3		b3 */\
-	"paddd %%mm0, %%mm2			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm2, %%mm0			\n\t" /* A3-B3		a3-b3 */\
-	"psrad $" #shift ", %%mm2		\n\t"\
-	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)
-
-#define DC_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq " #src4 ", %%mm1			\n\t" /* R6	R4	r6	r4 */\
+#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
 	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
 	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
 	"movq wm1010, %%mm4			\n\t"\
@@ -536,920 +462,822 @@
 	"movd %%mm4, %%eax			\n\t"\
 	"orl %%eax, %%eax			\n\t"\
 	"jz 1f					\n\t"\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 24(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C4R4	C6r6+C4r4 */\
-	"movq 32(%2), %%mm6			\n\t" /* C3	C1	C3	C1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
-	"movq 40(%2), %%mm7			\n\t" /* C7	C5	C7	C5 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B0		b0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A0+B0		a0+b0 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A0-B0		a0-b0 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0(%%mm6, %%mm4, dst) \
-\
-	"movq 56(%2), %%mm4			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* -C2R6-C4R4	-C2r6-C4r4 */\
-	"movq 64(%2), %%mm6			\n\t" /* -C7	C3	-C7	C3 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
-	"movq 72(%2), %%mm7			\n\t" /* -C5	-C1	-C5	-C1 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A1		a1 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
 	#rounder ", %%mm4			\n\t"\
-\
-	"movq 80(%2), %%mm5			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq 56(%2), %%mm5			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	#rounder ", %%mm0			\n\t"\
+	"paddd %%mm0, %%mm1			\n\t" /* A1		a1 */\
+	"paddd %%mm0, %%mm0			\n\t" \
+	"psubd %%mm1, %%mm0			\n\t" /* A2		a2 */\
+	"pmaddwd 64(%2), %%mm2			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
+	"paddd %%mm5, %%mm7			\n\t" /* B0		b0 */\
+	"movq 72(%2), %%mm5			\n\t" /* -C5	-C1	-C5	-C1 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
+	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
+	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
+	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"paddd %%mm2, %%mm5			\n\t" /* B1		b1 */\
+	"psrad $" #shift ", %%mm7		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE1(%%mm6, %%mm4, dst, %%mm7) \
-\
-	"movq 88(%2), %%mm4			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* C2R6-C4R4	C2r6-C4r4 */\
-	"movq 96(%2), %%mm6			\n\t" /* -C1	C5	-C1	C5 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
-	"movq 104(%2), %%mm7			\n\t" /* C3	C7	C3	C7 */\
+	"movq %%mm1, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm5, %%mm1			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm5, %%mm2			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm1, %%mm7			\n\t" /* A1+B1	a1+b1	A0+B0	a0+b0 */\
+	"packssdw %%mm4, %%mm2			\n\t" /* A0-B0	a0-b0	A1-B1	a1-b1 */\
+	"movq %%mm7, " #dst "			\n\t"\
+	"movq " #src1 ", %%mm1			\n\t" /* R3	R1	r3	r1 */\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"movq %%mm2, 24+" #dst "		\n\t"\
+	"pmaddwd %%mm1, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"movq 88(%2), %%mm7			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd 96(%2), %%mm1			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
 	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A2		a2 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"pmaddwd 120(%2), %%mm1			\n\t" /* -C6R6+C4R4	-C6r6+C4r4 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 128(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
-	"pmaddwd 136(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm1, %%mm0			\n\t" /* A3		a3 */\
-	#rounder ", %%mm0			\n\t"\
-	"paddd %%mm3, %%mm2			\n\t" /* B3		b3 */\
-	"paddd %%mm0, %%mm2			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm2, %%mm0			\n\t" /* A3-B3		a3-b3 */\
+	"movq %%mm0, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm7, %%mm4			\n\t" /* B2		b2 */\
+	"paddd %%mm4, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm0			\n\t" /* a2-B2		a2-b2 */\
 	"psrad $" #shift ", %%mm2		\n\t"\
 	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm1, %%mm3			\n\t" /* B3		b3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm4			\n\t" /* a3-B3		a3-b3 */\
+	"psrad $" #shift ", %%mm6		\n\t"\
+	"packssdw %%mm6, %%mm2			\n\t" /* A3+B3	a3+b3	A2+B2	a2+b2 */\
+	"movq %%mm2, 8+" #dst "			\n\t"\
+	"psrad $" #shift ", %%mm4		\n\t"\
+	"packssdw %%mm0, %%mm4			\n\t" /* A2-B2	a2-b2	A3-B3	a3-b3 */\
+	"movq %%mm4, 16+" #dst "		\n\t"\
 	"jmp 2f					\n\t"\
-	"#.balign 16				\n\t"\
 	"1:					\n\t"\
-	WRITE3(%%mm0, dst)\
-	"2:					\n\t"\
+	"pslld $16, %%mm0			\n\t"\
+	"paddd d40000, %%mm0			\n\t"\
+	"psrad $13, %%mm0			\n\t"\
+	"packssdw %%mm0, %%mm0			\n\t"\
+	"movq %%mm0, " #dst "			\n\t"\
+	"movq %%mm0, 8+" #dst "			\n\t"\
+	"movq %%mm0, 16+" #dst "		\n\t"\
+	"movq %%mm0, 24+" #dst "		\n\t"\
+	"2:					\n\t"
 
-#define Z_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift, bt) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq " #src4 ", %%mm1			\n\t" /* R6	R4	r6	r4 */\
+#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
 	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
 	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
 	"movq %%mm0, %%mm4			\n\t"\
 	"por %%mm1, %%mm4			\n\t"\
 	"por %%mm2, %%mm4			\n\t"\
 	"por %%mm3, %%mm4			\n\t"\
-	"packssdw %%mm4, %%mm4			\n\t"\
+	"packssdw %%mm4,%%mm4			\n\t"\
 	"movd %%mm4, %%eax			\n\t"\
 	"orl %%eax, %%eax			\n\t"\
 	"jz " #bt "				\n\t"\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 24(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C4R4	C6r6+C4r4 */\
-	"movq 32(%2), %%mm6			\n\t" /* C3	C1	C3	C1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
-	"movq 40(%2), %%mm7			\n\t" /* C7	C5	C7	C5 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
+	#rounder ", %%mm4			\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq 56(%2), %%mm5			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	#rounder ", %%mm0			\n\t"\
+	"paddd %%mm0, %%mm1			\n\t" /* A1		a1 */\
+	"paddd %%mm0, %%mm0			\n\t" \
+	"psubd %%mm1, %%mm0			\n\t" /* A2		a2 */\
+	"pmaddwd 64(%2), %%mm2			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
+	"paddd %%mm5, %%mm7			\n\t" /* B0		b0 */\
+	"movq 72(%2), %%mm5			\n\t" /* -C5	-C1	-C5	-C1 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
+	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
+	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
+	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"paddd %%mm2, %%mm5			\n\t" /* B1		b1 */\
+	"psrad $" #shift ", %%mm7		\n\t"\
+	"psrad $" #shift ", %%mm4		\n\t"\
+	"movq %%mm1, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm5, %%mm1			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm5, %%mm2			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm1, %%mm7			\n\t" /* A1+B1	a1+b1	A0+B0	a0+b0 */\
+	"packssdw %%mm4, %%mm2			\n\t" /* A0-B0	a0-b0	A1-B1	a1-b1 */\
+	"movq %%mm7, " #dst "			\n\t"\
+	"movq " #src1 ", %%mm1			\n\t" /* R3	R1	r3	r1 */\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"movq %%mm2, 24+" #dst "		\n\t"\
+	"pmaddwd %%mm1, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"movq 88(%2), %%mm7			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd 96(%2), %%mm1			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
+	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
+	"movq %%mm0, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm7, %%mm4			\n\t" /* B2		b2 */\
+	"paddd %%mm4, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm0			\n\t" /* a2-B2		a2-b2 */\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"psrad $" #shift ", %%mm0		\n\t"\
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm1, %%mm3			\n\t" /* B3		b3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm4			\n\t" /* a3-B3		a3-b3 */\
+	"psrad $" #shift ", %%mm6		\n\t"\
+	"packssdw %%mm6, %%mm2			\n\t" /* A3+B3	a3+b3	A2+B2	a2+b2 */\
+	"movq %%mm2, 8+" #dst "			\n\t"\
+	"psrad $" #shift ", %%mm4		\n\t"\
+	"packssdw %%mm0, %%mm4			\n\t" /* A2-B2	a2-b2	A3-B3	a3-b3 */\
+	"movq %%mm4, 16+" #dst "		\n\t"\
+
+#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
+	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
+	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
+	#rounder ", %%mm4			\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
 	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq 56(%2), %%mm5			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	#rounder ", %%mm0			\n\t"\
+	"paddd %%mm0, %%mm1			\n\t" /* A1		a1 */\
+	"paddd %%mm0, %%mm0			\n\t" \
+	"psubd %%mm1, %%mm0			\n\t" /* A2		a2 */\
+	"pmaddwd 64(%2), %%mm2			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
+	"paddd %%mm5, %%mm7			\n\t" /* B0		b0 */\
+	"movq 72(%2), %%mm5			\n\t" /* -C5	-C1	-C5	-C1 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
+	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
+	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
+	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"paddd %%mm2, %%mm5			\n\t" /* B1		b1 */\
+	"psrad $" #shift ", %%mm7		\n\t"\
+	"psrad $" #shift ", %%mm4		\n\t"\
+	"movq %%mm1, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm5, %%mm1			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm5, %%mm2			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm1, %%mm7			\n\t" /* A1+B1	a1+b1	A0+B0	a0+b0 */\
+	"packssdw %%mm4, %%mm2			\n\t" /* A0-B0	a0-b0	A1-B1	a1-b1 */\
+	"movq %%mm7, " #dst "			\n\t"\
+	"movq " #src1 ", %%mm1			\n\t" /* R3	R1	r3	r1 */\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"movq %%mm2, 24+" #dst "		\n\t"\
+	"pmaddwd %%mm1, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"movq 88(%2), %%mm7			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd 96(%2), %%mm1			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
+	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
+	"movq %%mm0, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm7, %%mm4			\n\t" /* B2		b2 */\
+	"paddd %%mm4, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm0			\n\t" /* a2-B2		a2-b2 */\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"psrad $" #shift ", %%mm0		\n\t"\
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm1, %%mm3			\n\t" /* B3		b3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm4			\n\t" /* a3-B3		a3-b3 */\
+	"psrad $" #shift ", %%mm6		\n\t"\
+	"packssdw %%mm6, %%mm2			\n\t" /* A3+B3	a3+b3	A2+B2	a2+b2 */\
+	"movq %%mm2, 8+" #dst "			\n\t"\
+	"psrad $" #shift ", %%mm4		\n\t"\
+	"packssdw %%mm0, %%mm4			\n\t" /* A2-B2	a2-b2	A3-B3	a3-b3 */\
+	"movq %%mm4, 16+" #dst "		\n\t"\
+
+//IDCT(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
+DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
+Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
+Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
+Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
+
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
+	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
+	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
 	#rounder ", %%mm4			\n\t"\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B0		b0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A0+B0		a0+b0 */\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	#rounder ", %%mm0			\n\t"\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
+	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq %%mm0, %%mm5			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"paddd %%mm1, %%mm0			\n\t" /* A1		a1 */\
+	"psubd %%mm1, %%mm5			\n\t" /* A2		a2 */\
+	"movq 56(%2), %%mm1			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	"pmaddwd 64(%2), %%mm2			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
+	"paddd %%mm1, %%mm7			\n\t" /* B0		b0 */\
+	"movq 72(%2), %%mm1			\n\t" /* -C5	-C1	-C5	-C1 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
+	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
 	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"paddd %%mm2, %%mm1			\n\t" /* B1		b1 */\
+	"psrad $" #shift ", %%mm7		\n\t"\
+	"psrad $" #shift ", %%mm4		\n\t"\
+	"movq %%mm0, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm1, %%mm0			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm1, %%mm2			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm0		\n\t"\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm7, %%mm7			\n\t" /* A0+B0	a0+b0 */\
+	"movd %%mm7, " #dst "			\n\t"\
+	"packssdw %%mm0, %%mm0			\n\t" /* A1+B1	a1+b1 */\
+	"movd %%mm0, 16+" #dst "		\n\t"\
+	"packssdw %%mm2, %%mm2			\n\t" /* A1-B1	a1-b1 */\
+	"movd %%mm2, 96+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A0-B0	a0-b0 */\
+	"movd %%mm4, 112+" #dst "		\n\t"\
+	"movq " #src1 ", %%mm0			\n\t" /* R3	R1	r3	r1 */\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"movq 88(%2), %%mm7			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd 96(%2), %%mm0			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
+	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
+	"movq %%mm5, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm7, %%mm4			\n\t" /* B2		b2 */\
+	"paddd %%mm4, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm5			\n\t" /* a2-B2		a2-b2 */\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"psrad $" #shift ", %%mm5		\n\t"\
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm0, %%mm3			\n\t" /* B3		b3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm4			\n\t" /* a3-B3		a3-b3 */\
 	"psrad $" #shift ", %%mm6		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0(%%mm6, %%mm4, dst) \
-\
-	"movq 56(%2), %%mm4			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* -C2R6-C4R4	-C2r6-C4r4 */\
-	"movq 64(%2), %%mm6			\n\t" /* -C7	C3	-C7	C3 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
-	"movq 72(%2), %%mm7			\n\t" /* -C5	-C1	-C5	-C1 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A1		a1 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"movq 80(%2), %%mm5			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE1(%%mm6, %%mm4, dst, %%mm7) \
-\
-	"movq 88(%2), %%mm4			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* C2R6-C4R4	C2r6-C4r4 */\
-	"movq 96(%2), %%mm6			\n\t" /* -C1	C5	-C1	C5 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
-	"movq 104(%2), %%mm7			\n\t" /* C3	C7	C3	C7 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A2		a2 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"pmaddwd 120(%2), %%mm1			\n\t" /* -C6R6+C4R4	-C6r6+C4r4 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 128(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
-	"pmaddwd 136(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm1, %%mm0			\n\t" /* A3		a3 */\
-	#rounder ", %%mm0			\n\t"\
-	"paddd %%mm3, %%mm2			\n\t" /* B3		b3 */\
-	"paddd %%mm0, %%mm2			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm2, %%mm0			\n\t" /* A3-B3		a3-b3 */\
-	"psrad $" #shift ", %%mm2		\n\t"\
-	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\
+	"packssdw %%mm2, %%mm2			\n\t" /* A2+B2	a2+b2 */\
+	"packssdw %%mm6, %%mm6			\n\t" /* A3+B3	a3+b3 */\
+	"movd %%mm2, 32+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A3-B3	a3-b3 */\
+	"packssdw %%mm5, %%mm5			\n\t" /* A2-B2	a2-b2 */\
+	"movd %%mm6, 48+" #dst "		\n\t"\
+	"movd %%mm4, 64+" #dst "		\n\t"\
+	"movd %%mm5, 80+" #dst "		\n\t"
 
 
-#define WRITE0(s0, s7, dst)\
-	"movq " #s0 ", " #dst "			\n\t" /* R0		r0 */\
-	"movq " #s7 ", 24+" #dst "		\n\t" /* R7		r7 */
-
-#define WRITE1(s1, s6, dst, tmp)\
-	"movq " #dst ", " #tmp "		\n\t" /* R0		r0 */\
-	"packssdw " #s1 ", " #tmp "		\n\t" /* R1	r1	R0	r0*/\
-	"movq " #tmp ", " #dst "		\n\t"\
-	"movq 24+" #dst ", " #tmp "		\n\t" /* R7		r7 */\
-	"packssdw " #tmp ", " #s6 "		\n\t" /* R7	r7	R6	r6*/\
-	"movq " #s6 ", 24+" #dst "		\n\t"
-
-#define WRITE2(s2, s5, s3, s4, dst)\
-	"packssdw " #s3 ", " #s2 "		\n\t" /* R3	r3	R2	r2*/\
-	"packssdw " #s5 ", " #s4 "		\n\t" /* R5	r5	R4	r4*/\
-	"movq " #s2 ", 8+" #dst "		\n\t"\
-	"movq " #s4 ", 16+" #dst "		\n\t"
-
-#define WRITE3(a, dst)\
-	"pslld $16, " #a "			\n\t"\
-	"paddd d40000, " #a "			\n\t"\
-	"psrad $13, " #a "			\n\t"\
-	"packssdw " #a ", " #a "		\n\t"\
-	"movq " #a ", " #dst "			\n\t"\
-	"movq " #a ", 8+" #dst "		\n\t"\
-	"movq " #a ", 16+" #dst "		\n\t"\
-	"movq " #a ", 24+" #dst "		\n\t"\
-
-#define WRITE0b(s0, s7, dst)\
-	"packssdw " #s0 ", " #s0 "		\n\t" /* C0, c0, C0, c0 */\
-	"packssdw " #s7 ", " #s7 "		\n\t" /* C7, c7, C7, c7 */\
-	"movd " #s0 ", " #dst "			\n\t" /* C0, c0 */\
-	"movd " #s7 ", 112+" #dst "		\n\t" /* C7, c7 */
-
-#define WRITE1b(s1, s6, dst, tmp)\
-	"packssdw " #s1 ", " #s1 "		\n\t" /* C1, c1, C1, c1 */\
-	"packssdw " #s6 ", " #s6 "		\n\t" /* C6, c6, C6, c6 */\
-	"movd " #s1 ", 16+" #dst "		\n\t" /* C1, c1 */\
-	"movd " #s6 ", 96+" #dst "		\n\t" /* C6, c6 */
-
-#define WRITE2b(s2, s5, s3, s4, dst)\
-	"packssdw " #s2 ", " #s2 "		\n\t" /* C2, c2, C2, c2 */\
-	"packssdw " #s3 ", " #s3 "		\n\t" /* C3, c3, C3, c3 */\
-	"movd " #s2 ", 32+" #dst "		\n\t" /* C2, c2 */\
-	"movd " #s3 ", 48+" #dst "		\n\t" /* C3, c3 */\
-	"packssdw " #s4 ", " #s4 "		\n\t" /* C4, c4, C4, c4 */\
-	"packssdw " #s5 ", " #s5 "		\n\t" /* C5, c5, C5, c5 */\
-	"movd " #s4 ", 64+" #dst "		\n\t" /* C4, c4 */\
-	"movd " #s5 ", 80+" #dst "		\n\t" /* C5, c5 */\
-
-
-//IDCT_CORE(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
-DC_COND_IDCT_CORE(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
-Z_COND_IDCT_CORE(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
-Z_COND_IDCT_CORE(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
-Z_COND_IDCT_CORE(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
-
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq " #src4 ", %%mm1			\n\t" /* R6	R4	r6	r4 */\
-	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
-	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 24(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C4R4	C6r6+C4r4 */\
-	"movq 32(%2), %%mm6			\n\t" /* C3	C1	C3	C1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
-	"movq 40(%2), %%mm7			\n\t" /* C7	C5	C7	C5 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B0		b0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A0+B0		a0+b0 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A0-B0		a0-b0 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0b(%%mm6, %%mm4, dst) \
-\
-	"movq 56(%2), %%mm4			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* -C2R6-C4R4	-C2r6-C4r4 */\
-	"movq 64(%2), %%mm6			\n\t" /* -C7	C3	-C7	C3 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
-	"movq 72(%2), %%mm7			\n\t" /* -C5	-C1	-C5	-C1 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A1		a1 */\
-\
-	"movq 80(%2), %%mm5			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE1b(%%mm6, %%mm4, dst, %%mm7) \
-\
-	"movq 88(%2), %%mm4			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* C2R6-C4R4	C2r6-C4r4 */\
-	"movq 96(%2), %%mm6			\n\t" /* -C1	C5	-C1	C5 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
-	"movq 104(%2), %%mm7			\n\t" /* C3	C7	C3	C7 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A2		a2 */\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"pmaddwd 120(%2), %%mm1			\n\t" /* -C6R6+C4R4	-C6r6+C4r4 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 128(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
-	"pmaddwd 136(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm1, %%mm0			\n\t" /* A3		a3 */\
-	"paddd %%mm3, %%mm2			\n\t" /* B3		b3 */\
-	"paddd %%mm0, %%mm2			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm2, %%mm0			\n\t" /* A3-B3		a3-b3 */\
-	"psrad $" #shift ", %%mm2		\n\t"\
-	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 	"jmp 9f					\n\t"
 
 	"#.balign 16				\n\t"\
 	"4:					\n\t"
-Z_COND_IDCT_CORE(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
-Z_COND_IDCT_CORE(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
+Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
+Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
 
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq " #src4 ", %%mm1			\n\t" /* R6	R4	r6	r4 */\
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
 	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 24(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C4R4	C6r6+C4r4 */\
-	"movq 40(%2), %%mm7			\n\t" /* C7	C5	C7	C5 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
+	#rounder ", %%mm4			\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	#rounder ", %%mm0			\n\t"\
 	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
-	"psrad $" #shift ", %%mm7		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0b(%%mm7, %%mm4, dst) \
-\
-	"movq 56(%2), %%mm4			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* -C2R6-C4R4	-C2r6-C4r4 */\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq %%mm0, %%mm5			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"paddd %%mm1, %%mm0			\n\t" /* A1		a1 */\
+	"psubd %%mm1, %%mm5			\n\t" /* A2		a2 */\
+	"movq 56(%2), %%mm1			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
 	"movq 72(%2), %%mm7			\n\t" /* -C5	-C1	-C5	-C1 */\
 	"pmaddwd %%mm3, %%mm7			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A1		a1 */\
-\
-	"movq 80(%2), %%mm5			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm4, %%mm7			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm7, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm7		\n\t"\
+	"paddd %%mm4, %%mm1			\n\t" /* A0+B0		a0+b0 */\
+	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
+	"psubd %%mm1, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"psrad $" #shift ", %%mm1		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE1b(%%mm7, %%mm4, dst, %%mm6) \
-\
-	"movq 88(%2), %%mm4			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* C2R6-C4R4	C2r6-C4r4 */\
-	"movq 104(%2), %%mm7			\n\t" /* C3	C7	C3	C7 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A2		a2 */\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm4, %%mm7			\n\t" /* A1+B1		a1+b1 */\
-	"pmaddwd 120(%2), %%mm1			\n\t" /* -C6R6+C4R4	-C6r6+C4r4 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm7, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 136(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
-	"psrad $" #shift ", %%mm7		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm1, %%mm0			\n\t" /* A3		a3 */\
-	"paddd %%mm0, %%mm3			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm3, %%mm0			\n\t" /* A3-B3		a3-b3 */\
-	"psrad $" #shift ", %%mm3		\n\t"\
+	"movq %%mm0, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm7, %%mm0			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm7, %%mm2			\n\t" /* A1-B1		a1-b1 */\
 	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2b(%%mm7, %%mm4, %%mm3, %%mm0, dst)
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm1, %%mm1			\n\t" /* A0+B0	a0+b0 */\
+	"movd %%mm1, " #dst "			\n\t"\
+	"packssdw %%mm0, %%mm0			\n\t" /* A1+B1	a1+b1 */\
+	"movd %%mm0, 16+" #dst "		\n\t"\
+	"packssdw %%mm2, %%mm2			\n\t" /* A1-B1	a1-b1 */\
+	"movd %%mm2, 96+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A0-B0	a0-b0 */\
+	"movd %%mm4, 112+" #dst "		\n\t"\
+	"movq 88(%2), %%mm1			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
+	"movq %%mm5, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm1, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm1, %%mm5			\n\t" /* a2-B2		a2-b2 */\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"psrad $" #shift ", %%mm5		\n\t"\
+	"movq %%mm6, %%mm1			\n\t" /* A3		a3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm1			\n\t" /* a3-B3		a3-b3 */\
+	"psrad $" #shift ", %%mm6		\n\t"\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"packssdw %%mm2, %%mm2			\n\t" /* A2+B2	a2+b2 */\
+	"packssdw %%mm6, %%mm6			\n\t" /* A3+B3	a3+b3 */\
+	"movd %%mm2, 32+" #dst "		\n\t"\
+	"packssdw %%mm1, %%mm1			\n\t" /* A3-B3	a3-b3 */\
+	"packssdw %%mm5, %%mm5			\n\t" /* A2-B2	a2-b2 */\
+	"movd %%mm6, 48+" #dst "		\n\t"\
+	"movd %%mm1, 64+" #dst "		\n\t"\
+	"movd %%mm5, 80+" #dst "		\n\t"	
 
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 	"jmp 9f					\n\t"
 
 	"#.balign 16				\n\t"\
 	"6:					\n\t"
-Z_COND_IDCT_CORE(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
+Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
 
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	#rounder ", %%mm4			\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	#rounder ", %%mm0			\n\t"\
+	"movq %%mm0, %%mm5			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 56(%2), %%mm1			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	"movq 72(%2), %%mm7			\n\t" /* -C5	-C1	-C5	-C1 */\
+	"pmaddwd %%mm3, %%mm7			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
+	"paddd %%mm4, %%mm1			\n\t" /* A0+B0		a0+b0 */\
+	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
+	"psubd %%mm1, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"psrad $" #shift ", %%mm4		\n\t"\
+	"movq %%mm0, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm7, %%mm0			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm7, %%mm2			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm0		\n\t"\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm1, %%mm1			\n\t" /* A0+B0	a0+b0 */\
+	"movd %%mm1, " #dst "			\n\t"\
+	"packssdw %%mm0, %%mm0			\n\t" /* A1+B1	a1+b1 */\
+	"movd %%mm0, 16+" #dst "		\n\t"\
+	"packssdw %%mm2, %%mm2			\n\t" /* A1-B1	a1-b1 */\
+	"movd %%mm2, 96+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A0-B0	a0-b0 */\
+	"movd %%mm4, 112+" #dst "		\n\t"\
+	"movq 88(%2), %%mm1			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
+	"movq %%mm5, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm1, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm1, %%mm5			\n\t" /* a2-B2		a2-b2 */\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"psrad $" #shift ", %%mm5		\n\t"\
+	"movq %%mm6, %%mm1			\n\t" /* A3		a3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm1			\n\t" /* a3-B3		a3-b3 */\
+	"psrad $" #shift ", %%mm6		\n\t"\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"packssdw %%mm2, %%mm2			\n\t" /* A2+B2	a2+b2 */\
+	"packssdw %%mm6, %%mm6			\n\t" /* A3+B3	a3+b3 */\
+	"movd %%mm2, 32+" #dst "		\n\t"\
+	"packssdw %%mm1, %%mm1			\n\t" /* A3-B3	a3-b3 */\
+	"packssdw %%mm5, %%mm5			\n\t" /* A2-B2	a2-b2 */\
+	"movd %%mm6, 48+" #dst "		\n\t"\
+	"movd %%mm1, 64+" #dst "		\n\t"\
+	"movd %%mm5, 80+" #dst "		\n\t"	
+
+
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+	"jmp 9f					\n\t"
+
+	"#.balign 16				\n\t"\
+	"2:					\n\t"
+Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
+
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
 	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 40(%2), %%mm7			\n\t" /* C7	C5	C7	C5 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	#rounder ", %%mm4			\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	#rounder ", %%mm0			\n\t"\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
+	"movq %%mm0, %%mm5			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 56(%2), %%mm1			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	"pmaddwd 64(%2), %%mm2			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
+	"paddd %%mm1, %%mm7			\n\t" /* B0		b0 */\
+	"movq 72(%2), %%mm1			\n\t" /* -C5	-C1	-C5	-C1 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
+	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
+	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
+	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"paddd %%mm2, %%mm1			\n\t" /* B1		b1 */\
+	"psrad $" #shift ", %%mm7		\n\t"\
+	"psrad $" #shift ", %%mm4		\n\t"\
+	"movq %%mm0, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm1, %%mm0			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm1, %%mm2			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm0		\n\t"\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm7, %%mm7			\n\t" /* A0+B0	a0+b0 */\
+	"movd %%mm7, " #dst "			\n\t"\
+	"packssdw %%mm0, %%mm0			\n\t" /* A1+B1	a1+b1 */\
+	"movd %%mm0, 16+" #dst "		\n\t"\
+	"packssdw %%mm2, %%mm2			\n\t" /* A1-B1	a1-b1 */\
+	"movd %%mm2, 96+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A0-B0	a0-b0 */\
+	"movd %%mm4, 112+" #dst "		\n\t"\
+	"movq " #src1 ", %%mm0			\n\t" /* R3	R1	r3	r1 */\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"movq 88(%2), %%mm7			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd 96(%2), %%mm0			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
+	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
+	"movq %%mm5, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm7, %%mm4			\n\t" /* B2		b2 */\
+	"paddd %%mm4, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm5			\n\t" /* a2-B2		a2-b2 */\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"psrad $" #shift ", %%mm5		\n\t"\
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm0, %%mm3			\n\t" /* B3		b3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm4			\n\t" /* a3-B3		a3-b3 */\
+	"psrad $" #shift ", %%mm6		\n\t"\
+	"psrad $" #shift ", %%mm4		\n\t"\
+	"packssdw %%mm2, %%mm2			\n\t" /* A2+B2	a2+b2 */\
+	"packssdw %%mm6, %%mm6			\n\t" /* A3+B3	a3+b3 */\
+	"movd %%mm2, 32+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A3-B3	a3-b3 */\
+	"packssdw %%mm5, %%mm5			\n\t" /* A2-B2	a2-b2 */\
+	"movd %%mm6, 48+" #dst "		\n\t"\
+	"movd %%mm4, 64+" #dst "		\n\t"\
+	"movd %%mm5, 80+" #dst "		\n\t"
+
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+	"jmp 9f					\n\t"
+
+	"#.balign 16				\n\t"\
+	"3:					\n\t"
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	#rounder ", %%mm4			\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	#rounder ", %%mm0			\n\t"\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
+	"movq %%mm0, %%mm5			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 64(%2), %%mm3			\n\t"\
+	"pmaddwd %%mm2, %%mm3			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
 	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
 	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
 	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
 	"psrad $" #shift ", %%mm7		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0b(%%mm7, %%mm4, dst) \
-\
-	"movq 72(%2), %%mm7			\n\t" /* -C5	-C1	-C5	-C1 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
-\
-	"movq 80(%2), %%mm4			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm5, %%mm7			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm5, %%mm5			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm7, %%mm5			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm7		\n\t"\
-	"psrad $" #shift ", %%mm5		\n\t"\
-	WRITE1b(%%mm7, %%mm5, dst, %%mm6) \
-\
-	"movq 104(%2), %%mm7			\n\t" /* C3	C7	C3	C7 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm4, %%mm7			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm7, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 136(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
-	"psrad $" #shift ", %%mm7		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm0, %%mm3			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm3, %%mm0			\n\t" /* A3-B3		a3-b3 */\
-	"psrad $" #shift ", %%mm3		\n\t"\
+	"movq %%mm0, %%mm1			\n\t" /* A1		a1 */\
+	"paddd %%mm3, %%mm0			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm3, %%mm1			\n\t" /* A1-B1		a1-b1 */\
 	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2b(%%mm7, %%mm4, %%mm3, %%mm0, dst)
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
-	"jmp 9f					\n\t"
-
-	"#.balign 16				\n\t"\
-	"2:					\n\t"
-Z_COND_IDCT_CORE(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
-
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
-	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 32(%2), %%mm6			\n\t" /* C3	C1	C3	C1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
-	"movq 40(%2), %%mm7			\n\t" /* C7	C5	C7	C5 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B0		b0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A0+B0		a0+b0 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"packssdw %%mm7, %%mm7			\n\t" /* A0+B0	a0+b0 */\
+	"movd %%mm7, " #dst "			\n\t"\
+	"packssdw %%mm0, %%mm0			\n\t" /* A1+B1	a1+b1 */\
+	"movd %%mm0, 16+" #dst "		\n\t"\
+	"packssdw %%mm1, %%mm1			\n\t" /* A1-B1	a1-b1 */\
+	"movd %%mm1, 96+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A0-B0	a0-b0 */\
+	"movd %%mm4, 112+" #dst "		\n\t"\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"pmaddwd %%mm2, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"pmaddwd 96(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
+	"movq %%mm5, %%mm1			\n\t" /* A2		a2 */\
+	"paddd %%mm4, %%mm1			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm5			\n\t" /* a2-B2		a2-b2 */\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"psrad $" #shift ", %%mm5		\n\t"\
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm2, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm2, %%mm4			\n\t" /* a3-B3		a3-b3 */\
 	"psrad $" #shift ", %%mm6		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0b(%%mm6, %%mm4, dst) \
-\
-	"movq 64(%2), %%mm6			\n\t" /* -C7	C3	-C7	C3 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
-	"movq 72(%2), %%mm7			\n\t" /* -C5	-C1	-C5	-C1 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
-\
-	"movq 80(%2), %%mm4			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm5, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm5, %%mm5			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm5			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm5		\n\t"\
-	WRITE1b(%%mm6, %%mm5, dst, %%mm7) \
-\
-	"movq 96(%2), %%mm6			\n\t" /* -C1	C5	-C1	C5 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
-	"movq 104(%2), %%mm7			\n\t" /* C3	C7	C3	C7 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 128(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
-	"pmaddwd 136(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm3, %%mm2			\n\t" /* B3		b3 */\
-	"paddd %%mm0, %%mm2			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm2, %%mm0			\n\t" /* A3-B3		a3-b3 */\
-	"psrad $" #shift ", %%mm2		\n\t"\
-	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
-	"jmp 9f					\n\t"
+	"packssdw %%mm1, %%mm1			\n\t" /* A2+B2	a2+b2 */\
+	"packssdw %%mm6, %%mm6			\n\t" /* A3+B3	a3+b3 */\
+	"movd %%mm1, 32+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A3-B3	a3-b3 */\
+	"packssdw %%mm5, %%mm5			\n\t" /* A2-B2	a2-b2 */\
+	"movd %%mm6, 48+" #dst "		\n\t"\
+	"movd %%mm4, 64+" #dst "		\n\t"\
+	"movd %%mm5, 80+" #dst "		\n\t"
 
-	"#.balign 16				\n\t"\
-	"3:					\n\t"
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 32(%2), %%mm6			\n\t" /* C3	C1	C3	C1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A0+B0		a0+b0 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A0-B0		a0-b0 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0b(%%mm6, %%mm4, dst) \
-\
-	"movq 64(%2), %%mm6			\n\t" /* -C7	C3	-C7	C3 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
-\
-	"movq 80(%2), %%mm4			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm5, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm5, %%mm5			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm5			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm5		\n\t"\
-	WRITE1b(%%mm6, %%mm5, dst, %%mm7) \
-\
-	"movq 96(%2), %%mm6			\n\t" /* -C1	C5	-C1	C5 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 128(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm0, %%mm2			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm2, %%mm0			\n\t" /* A3-B3		a3-b3 */\
-	"psrad $" #shift ", %%mm2		\n\t"\
-	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
 
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 	"jmp 9f					\n\t"
 
 	"#.balign 16				\n\t"\
 	"5:					\n\t"
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"movq %%mm4, %%mm6\n\t"\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq " #src4 ", %%mm1			\n\t" /* R6	R4	r6	r4 */\
-	"movq 24(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"movq %%mm5, %%mm7\n\t"\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C4R4	C6r6+C4r4 */\
-	"movq 8+" #src0 ", %%mm2		\n\t" /*2R2	R0	r2	r0 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /*2C2R2+C4R0	C2r2+C4r0 */\
-	"movq 8+" #src4 ", %%mm3		\n\t" /*2R6	R4	r6	r4 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /*2C6R6+C4R4	C6r6+C4r4 */\
-\
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
+	#rounder ", %%mm4			\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
 	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
+	#rounder ", %%mm0			\n\t"\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq %%mm0, %%mm5			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"paddd %%mm1, %%mm0			\n\t" /* A1		a1 */\
+	"psubd %%mm1, %%mm5			\n\t" /* A2		a2 */\
+	"movq 8+" #src0 ", %%mm2		\n\t" /* R4	R0	r4	r0 */\
+	"movq 8+" #src4 ", %%mm3		\n\t" /* R6	R2	r6	r2 */\
+	"movq 16(%2), %%mm1			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm2, %%mm1			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm7			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm7, %%mm2			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm7			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm3, %%mm7			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"pmaddwd 40(%2), %%mm3			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
+	#rounder ", %%mm1			\n\t"\
+	"paddd %%mm1, %%mm7			\n\t" /* A0		a0 */\
+	"paddd %%mm1, %%mm1			\n\t" /* 2C0		2c0 */\
+	#rounder ", %%mm2			\n\t"\
+	"psubd %%mm7, %%mm1			\n\t" /* A3		a3 */\
+	"paddd %%mm2, %%mm3			\n\t" /* A1		a1 */\
+	"paddd %%mm2, %%mm2			\n\t" /* 2C1		2c1 */\
+	"psubd %%mm3, %%mm2			\n\t" /* A2		a2 */\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-\
-	"paddd %%mm7, %%mm6			\n\t" /*2A0		a0 */\
-	"movq 56(%2), %%mm7			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"pmaddwd %%mm1, %%mm7			\n\t" /* -C2R6-C4R4	-C2r6-C4r4 */\
-\
-	"packssdw %%mm6, %%mm4			\n\t" /* C0, c0, C0, c0 */\
-	"movq 48(%2), %%mm6			\n\t" /* C6	C4	C6	C4 */\
-	"movq %%mm4, " #dst "			\n\t" /* C0, c0 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /*2C6R2+C4R0	C6r2+C4r0 */\
-\
-	"movq %%mm4, 112+" #dst "		\n\t" /* C0, c0 */\
-	"movq 56(%2), %%mm4			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"pmaddwd %%mm3, %%mm4			\n\t" /*2-C2R6-C4R4	-C2r6-C4r4 */\
-\
-	"paddd %%mm5, %%mm7			\n\t" /* A1		a1 */\
-	"movq 80(%2), %%mm5			\n\t" /* -C6	C4	-C6	C4 */\
 	"psrad $" #shift ", %%mm7		\n\t"\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-\
-	"paddd %%mm4, %%mm6			\n\t" /*2A1		a1 */\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-\
+	"psrad $" #shift ", %%mm3		\n\t"\
+	"packssdw %%mm7, %%mm4			\n\t" /* A0	a0 */\
+	"movq %%mm4, " #dst "			\n\t"\
+	"psrad $" #shift ", %%mm0		\n\t"\
+	"packssdw %%mm3, %%mm0			\n\t" /* A1	a1 */\
+	"movq %%mm0, 16+" #dst "		\n\t"\
+	"movq %%mm0, 96+" #dst "		\n\t"\
+	"movq %%mm4, 112+" #dst "		\n\t"\
+	"psrad $" #shift ", %%mm5		\n\t"\
 	"psrad $" #shift ", %%mm6		\n\t"\
-	"movq 88(%2), %%mm4			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* C2R6-C4R4	C2r6-C4r4 */\
-\
-	"pmaddwd 120(%2), %%mm1			\n\t" /* -C6R6+C4R4	-C6r6+C4r4 */\
-	"packssdw %%mm6, %%mm7			\n\t" /* C1, c1, C1, c1 */\
-\
-	"movq 80(%2), %%mm6			\n\t" /* -C6	C4	-C6	C4 */\
-	"movq %%mm7, 16+" #dst "		\n\t" /* C1, c1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /*2-C6R2+C4R0	-C6r2+C4r0 */\
-\
-	"movq %%mm7, 96+" #dst "		\n\t" /* C1, c1 */\
-	"movq 88(%2), %%mm7			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /*2C2R6-C4R4	C2r6-C4r4 */\
-\
-	"pmaddwd 112(%2), %%mm2			\n\t" /*2-C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A2		a2 */\
-\
-	"pmaddwd 120(%2), %%mm3			\n\t" /*2-C6R6+C4R4	-C6r6+C4r4 */\
-	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm7, %%mm6			\n\t" /*2A2		a2 */\
-	"paddd %%mm1, %%mm0			\n\t" /* A3		a3 */\
-\
-	"psrad $" #shift ", %%mm6		\n\t"\
-\
-	"packssdw %%mm6, %%mm4			\n\t" /* C2, c2, C2, c2 */\
-	"movq %%mm4, 32+" #dst "		\n\t" /* C2, c2 */\
-	"psrad $" #shift ", %%mm0		\n\t"\
-	"paddd %%mm3, %%mm2			\n\t" /*2A3		a3 */\
-\
-	"movq %%mm4, 80+" #dst "		\n\t" /* C2, c2 */\
 	"psrad $" #shift ", %%mm2		\n\t"\
-\
-	"packssdw %%mm2, %%mm0			\n\t" /* C3, c3, C3, c3 */\
-	"movq %%mm0, 48+" #dst "		\n\t" /* C3, c3 */\
-	"movq %%mm0, 64+" #dst "		\n\t" /* C3, c3 */\
+	"packssdw %%mm2, %%mm5			\n\t" /* A2-B2	a2-b2 */\
+	"movq %%mm5, 32+" #dst "		\n\t"\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"packssdw %%mm1, %%mm6			\n\t" /* A3+B3	a3+b3 */\
+	"movq %%mm6, 48+" #dst "		\n\t"\
+	"movq %%mm6, 64+" #dst "		\n\t"\
+	"movq %%mm5, 80+" #dst "		\n\t"	
+	
 
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-//IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-//IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 	"jmp 9f					\n\t"
 
 
 	"#.balign 16				\n\t"\
 	"1:					\n\t"
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq " #src4 ", %%mm1			\n\t" /* R6	R4	r6	r4 */\
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
 	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 24(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C4R4	C6r6+C4r4 */\
-	"movq 32(%2), %%mm6			\n\t" /* C3	C1	C3	C1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
+	#rounder ", %%mm4			\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	#rounder ", %%mm0			\n\t"\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
 	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A0+B0		a0+b0 */\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq %%mm0, %%mm5			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"paddd %%mm1, %%mm0			\n\t" /* A1		a1 */\
+	"psubd %%mm1, %%mm5			\n\t" /* A2		a2 */\
+	"movq 64(%2), %%mm1			\n\t"\
+	"pmaddwd %%mm2, %%mm1			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
+	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
 	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A0-B0		a0-b0 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
+	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"psrad $" #shift ", %%mm7		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0b(%%mm6, %%mm4, dst) \
-\
-	"movq 56(%2), %%mm4			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* -C2R6-C4R4	-C2r6-C4r4 */\
-	"movq 64(%2), %%mm6			\n\t" /* -C7	C3	-C7	C3 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A1		a1 */\
-\
-	"movq 80(%2), %%mm5			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
+	"movq %%mm0, %%mm3			\n\t" /* A1		a1 */\
+	"paddd %%mm1, %%mm0			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm1, %%mm3			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm0		\n\t"\
+	"psrad $" #shift ", %%mm3		\n\t"\
+	"packssdw %%mm7, %%mm7			\n\t" /* A0+B0	a0+b0 */\
+	"movd %%mm7, " #dst "			\n\t"\
+	"packssdw %%mm0, %%mm0			\n\t" /* A1+B1	a1+b1 */\
+	"movd %%mm0, 16+" #dst "		\n\t"\
+	"packssdw %%mm3, %%mm3			\n\t" /* A1-B1	a1-b1 */\
+	"movd %%mm3, 96+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A0-B0	a0-b0 */\
+	"movd %%mm4, 112+" #dst "		\n\t"\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"pmaddwd %%mm2, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"pmaddwd 96(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
+	"movq %%mm5, %%mm3			\n\t" /* A2		a2 */\
+	"paddd %%mm4, %%mm3			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm5			\n\t" /* a2-B2		a2-b2 */\
+	"psrad $" #shift ", %%mm3		\n\t"\
+	"psrad $" #shift ", %%mm5		\n\t"\
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm2, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm2, %%mm4			\n\t" /* a3-B3		a3-b3 */\
 	"psrad $" #shift ", %%mm6		\n\t"\
+	"packssdw %%mm3, %%mm3			\n\t" /* A2+B2	a2+b2 */\
+	"movd %%mm3, 32+" #dst "		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE1b(%%mm6, %%mm4, dst, %%mm7) \
-\
-	"movq 88(%2), %%mm4			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* C2R6-C4R4	C2r6-C4r4 */\
-	"movq 96(%2), %%mm6			\n\t" /* -C1	C5	-C1	C5 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A2		a2 */\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"pmaddwd 120(%2), %%mm1			\n\t" /* -C6R6+C4R4	-C6r6+C4r4 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 128(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm1, %%mm0			\n\t" /* A3		a3 */\
-	"paddd %%mm0, %%mm2			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm2, %%mm0			\n\t" /* A3-B3		a3-b3 */\
-	"psrad $" #shift ", %%mm2		\n\t"\
-	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
+	"packssdw %%mm6, %%mm6			\n\t" /* A3+B3	a3+b3 */\
+	"movd %%mm6, 48+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A3-B3	a3-b3 */\
+	"packssdw %%mm5, %%mm5			\n\t" /* A2-B2	a2-b2 */\
+	"movd %%mm4, 64+" #dst "		\n\t"\
+	"movd %%mm5, 80+" #dst "		\n\t"
+	
 
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 	"jmp 9f					\n\t"
 
 
 	"#.balign 16				\n\t"
 	"7:					\n\t"
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq 16(%2), %%mm2			\n\t" /* C2	C4	C2	C4 */\
-	"movq 8+" #src0 ", %%mm1		\n\t" /* R2	R0	r2	r0 */\
-	"pmaddwd %%mm0, %%mm2			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 16(%2), %%mm3			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm1, %%mm3			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-\
-	"movq 48(%2), %%mm4			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"movq 80(%2), %%mm6			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm6			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"movq 80(%2), %%mm7			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm1, %%mm7			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	#rounder ", %%mm4			\n\t"\
+	#rounder ", %%mm0			\n\t"\
+	"psrad $" #shift ", %%mm4		\n\t"\
+	"psrad $" #shift ", %%mm0		\n\t"\
+	"movq 8+" #src0 ", %%mm2		\n\t" /* R4	R0	r4	r0 */\
+	"movq 16(%2), %%mm1			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm2, %%mm1			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm7			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm7, %%mm2			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm7			\n\t" /* C6	C2	C6	C2 */\
+	#rounder ", %%mm1			\n\t"\
+	#rounder ", %%mm2			\n\t"\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"packssdw %%mm1, %%mm4			\n\t" /* A0	a0 */\
+	"movq %%mm4, " #dst "			\n\t"\
 	"psrad $" #shift ", %%mm2		\n\t"\
-	"psrad $" #shift ", %%mm3		\n\t"\
-	"pmaddwd 112(%2), %%mm1			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"packssdw %%mm3, %%mm2			\n\t" /* C0, c0, C0, c0 */\
-	"movq %%mm2, " #dst "			\n\t" /* C0, c0 */\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	"psrad $" #shift ", %%mm5		\n\t"\
-	"movq %%mm2, 112+" #dst "		\n\t" /* C0, c0 */\
-	"packssdw %%mm5, %%mm4			\n\t" /* C1, c1, C1, c1 */\
-	"movq %%mm4, 16+" #dst "		\n\t" /* C0, c0 */\
-	"psrad $" #shift ", %%mm7		\n\t"\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"movq %%mm4, 96+" #dst "		\n\t" /* C0, c0 */\
-	"packssdw %%mm7, %%mm6			\n\t" /* C2, c2, C2, c2 */\
-	"movq %%mm6, 32+" #dst "		\n\t" /* C0, c0 */\
-	"psrad $" #shift ", %%mm0		\n\t"\
-	"movq %%mm6, 80+" #dst "		\n\t" /* C0, c0 */\
-	"psrad $" #shift ", %%mm1		\n\t"\
-	"packssdw %%mm1, %%mm0			\n\t" /* C3, c3, C3, c3 */\
-	"movq %%mm0, 48+" #dst "		\n\t" /* C0, c0 */\
-	"movq %%mm0, 64+" #dst "		\n\t" /* C0, c0 */\
+	"packssdw %%mm2, %%mm0			\n\t" /* A1	a1 */\
+	"movq %%mm0, 16+" #dst "		\n\t"\
+	"movq %%mm0, 96+" #dst "		\n\t"\
+	"movq %%mm4, 112+" #dst "		\n\t"\
+	"movq %%mm0, 32+" #dst "		\n\t"\
+	"movq %%mm4, 48+" #dst "		\n\t"\
+	"movq %%mm4, 64+" #dst "		\n\t"\
+	"movq %%mm0, 80+" #dst "		\n\t"	
 
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-//IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-//IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 
 
 #endif
 
 /*
 Input
- 00 20 02 22 40 60 42 62
- 10 30 12 32 50 70 52 72
- 01 21 03 23 41 61 43 63
+ 00 40 04 44 20 60 24 64
+ 10 30 14 34 50 70 54 74
+ 01 41 03 43 21 61 23 63
  11 31 13 33 51 71 53 73
- 04 24 06 26 44 64 46 66
- 14 34 16 36 54 74 56 76
-...
-*/
-/*
+ 02 42 06 46 22 62 26 66
+ 12 32 16 36 52 72 56 76
+ 05 45 07 47 25 65 27 67
+ 15 35 17 37 55 75 57 77
+  
 Temp
- 00 02 10 12 20 22 30 32
- 40 42 50 52 60 62 70 72
+ 00 04 10 14 20 24 30 34
+ 40 44 50 54 60 64 70 74
  01 03 11 13 21 23 31 33
  41 43 51 53 61 63 71 73
- 04 06 14 16 24 26 34 36
- 44 46 54 56 64 66 74 76
+ 02 06 12 16 22 26 32 36
+ 42 46 52 56 62 66 72 76
  05 07 15 17 25 27 35 37
  45 47 55 57 65 67 75 77
 */
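
The Temp layout above is what the column pass consumes: each column-pass IDCT() invocation handles two columns at a time (two 16-bit results per movd) and scatters its eight results across the output block at one-row (16-byte) strides, which is exactly the 0, 16, 32, 48 / 112, 96, 80, 64 byte offsets of the movd stores in the macros. A minimal plain-C sketch of that store pattern, using a hypothetical helper that is not part of the patch and ignoring the two-columns-per-movd packing and the packssdw saturation:

/* sum[i] stands for A_i+B_i and diff[i] for A_i-B_i, as in the comments above;
   dst points at the first int16_t of the column being written, and one row of
   the 8x8 output block is 8 int16_t = 16 bytes. */
static void store_column_sketch(int16_t *dst, const int sum[4], const int diff[4])
{
	int i;
	for (i = 0; i < 4; i++) {
		dst[8 *  i     ] = (int16_t)sum[i];  /* rows 0..3: byte offsets 0, 16, 32, 48   */
		dst[8 * (7 - i)] = (int16_t)diff[i]; /* rows 7..4: byte offsets 112, 96, 80, 64 */
	}
}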
 
-/*
-Output
- 00 10 20 30 40 50 60 70
- 01 11 21 31 41 51 61 71
-...
-*/
-
 "9: \n\t"
 		:: "r" (block), "r" (temp), "r" (coeffs)
 		: "%eax"
 	);
-/*
-idctCol(block, temp);
-idctCol(block+1, temp+2);
-idctCol(block+2, temp+4);
-idctCol(block+3, temp+6);
-idctCol(block+4, temp+8);
-idctCol(block+5, temp+10);
-idctCol(block+6, temp+12);
-idctCol(block+7, temp+14);
-*/
 }
 
 void simple_idct_mmx(int16_t *block)
 {
-	static int imax=0, imin=0;
-	static int omax=0, omin=0;
-	int i, j;
-/*
-	for(i=0; i<64; i++)
-	{
-		if(block[i] > imax)
-		{
-			imax= block[i];
-			printf("Input-Max: %d\n", imax);
-			printf("Input-Min: %d\n", imin);
-			printf("Output-Max: %d\n", omax);
-			printf("Output-Min: %d\n", omin);
-		}
-		if(block[i] < imin)
-		{
-			imin= block[i];
-			printf("Input-Max: %d\n", imax);
-			printf("Input-Min: %d\n", imin);
-			printf("Output-Max: %d\n", omax);
-			printf("Output-Min: %d\n", omin);
-		}
-	}*/
-/*	static int stat[64];
-	for(j=0; j<4; j++)
-	{
-		static int line[8]={0,2,1,3,4,6,5,7};
-		for(i=0; i<16; i++)
-		{
-			if(block[j*16+i])
-			{
-				stat[j*16+1]++;
-				break;
-			}
-		}
-		for(i=0; i<16; i++)
-		{
-			if(block[j*16+i] && i!=0 && i!=2)
-			{
-				stat[j*16+2]++;
-				break;
-			}
-		}
-	}
-	stat[0]++;*/
-/*	for(i=1; i<8; i++)
-	{
-		if(block[i] != 0)
-		{
-			stat[1]++;
-			break;
-		}
-	}
-	for(i=32; i<64; i++)
-	{
-		if(block[i] != 0)
-		{
-			stat[2]++;
-			break;
-		}
-	}
-	stat[0]++;
-*/
-//	return;
 	idct(block);
-//	memset(block, 0, 128);
-/*
-	if(stat[0] > 100000)
-		for(i=0; i<64; i++)
-		{
-			if((i&7) == 0) printf("\n");
-			printf("%06d ", stat[i]);
-		}
-*/
-/*
-	for(i=0; i<4; i++) printf("%d", stat[1+i*16]);
-	printf("  ");
-	for(i=0; i<4; i++) printf("%d", stat[2+i*16]);
-	printf("\n");
-*/
-//	printf("%d", stat[2]);
-
-//	memset(stat, 0, 256);
-
-/*
-	for(i=0; i<64; i++)
-	{
-		if(block[i] > omax)
-		{
-			omax= block[i];
-			printf("Input-Max: %d\n", imax);
-			printf("Input-Min: %d\n", imin);
-			printf("Output-Max: %d\n", omax);
-			printf("Output-Min: %d\n", omin);
-		}
-		if(block[i] < omin)
-		{
-			omin= block[i];
-			printf("Input-Max: %d\n", imax);
-			printf("Input-Min: %d\n", imin);
-			printf("Output-Max: %d\n", omax);
-			printf("Output-Min: %d\n", omin);
-		}
-	}*/
 }
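
With the debug scaffolding removed, simple_idct_mmx() is a thin wrapper around idct(), so it relies on the caller having stored the coefficients in the permuted order shown in the Input comment above. A caller-side sketch, assuming a hypothetical 64-entry table perm[] that maps natural row-major positions to that permuted layout (not an API provided by this file):

static void simple_idct_mmx_natural_order(int16_t *block, const unsigned char perm[64])
{
	int16_t tmp[64];
	int i;
	/* reorder a naturally ordered block into the layout the MMX row pass expects */
	for (i = 0; i < 64; i++)
		tmp[perm[i]] = block[i];
	for (i = 0; i < 64; i++)
		block[i] = tmp[i];
	simple_idct_mmx(block);
}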