changeset 28853:e3f3a991ba81

Remove the internal NuppelVideo decoder; the code in libavcodec can decode those files (and some more) and is far more maintainable.
author reimar
date Sun, 08 Mar 2009 13:32:42 +0000
parents 1f9343ea37df
children 96c6451478af
files Makefile etc/codecs.conf libmpcodecs/native/nuppelvideo.c libmpcodecs/native/rtjpegn.c libmpcodecs/native/rtjpegn.h libmpcodecs/vd.c libmpcodecs/vd_nuv.c
diffstat 7 files changed, 0 insertions(+), 1562 deletions(-) [+]
line wrap: on
line diff
--- a/Makefile	Sun Mar 08 13:21:00 2009 +0000
+++ b/Makefile	Sun Mar 08 13:32:42 2009 +0000
@@ -92,7 +92,6 @@
               libmpcodecs/dec_video.c \
               libmpcodecs/img_format.c \
               libmpcodecs/mp_image.c \
-              libmpcodecs/native/nuppelvideo.c \
               libmpcodecs/native/rtjpegn.c \
               libmpcodecs/native/xa_gsm.c \
               libmpcodecs/pullup.c \
@@ -102,7 +101,6 @@
               libmpcodecs/vd_mpegpes.c \
               libmpcodecs/vd_mtga.c \
               libmpcodecs/vd_null.c \
-              libmpcodecs/vd_nuv.c \
               libmpcodecs/vd_raw.c \
               libmpcodecs/vd_sgi.c \
               libmpcodecs/vf.c \
--- a/etc/codecs.conf	Sun Mar 08 13:21:00 2009 +0000
+++ b/etc/codecs.conf	Sun Mar 08 13:32:42 2009 +0000
@@ -319,14 +319,6 @@
   dll nuv
   out I420
 
-videocodec nuv
-  info "NuppelVideo"
-  status working
-  fourcc NUV1 ; NUV1 is an internal MPlayer FOURCC
-  fourcc RJPG
-  driver nuv
-  out I420,IYUV
-
 videocodec ffbmp
   info "FFmpeg BMP"
   status working
--- a/libmpcodecs/native/nuppelvideo.c	Sun Mar 08 13:21:00 2009 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,114 +0,0 @@
-/*
- * NuppelVideo 0.05 file parser
- * for MPlayer
- * by Panagiotis Issaris <takis@lumumba.luc.ac.be>
- *
- * Reworked by alex
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <string.h>
-
-#include "config.h"
-#include "mp_msg.h"
-#include "mpbswap.h"
-
-#include "libvo/fastmemcpy.h"
-
-#include "libmpdemux/nuppelvideo.h" 
-#include "rtjpegn.h"
-#include "libavutil/lzo.h"
-
-#define KEEP_BUFFER
-/* Decode one NuppelVideo packet: 'D' packets load RTjpeg tables, 'V' packets fill decoded with a planar YUV420 frame (width*height*3/2 bytes). */
-void decode_nuv( unsigned char *encoded, int encoded_size,
-		unsigned char *decoded, int width, int height)
-{
-	int r; /* av_lzo1x_decode() status; any non-zero value is treated as failure */
-	unsigned int out_len = width * height + ( width * height ) / 2; /* bytes in one YUV420 frame: luma plane plus half as much chroma */
-	struct rtframeheader *encodedh = ( struct rtframeheader* ) encoded; /* the frame header overlays the start of the packet; payload is read from encoded+12 */
-	static unsigned char *buffer = 0; /* LZO output scratch for comptype '2'; allocated on first use and never freed in this function */
-#ifdef KEEP_BUFFER
-	static unsigned char *previous_buffer = 0; /* copy of the last decoded frame, used by comptype 'L' (last-frame copy) */
-#endif
-
-//	printf("frametype: %c, comtype: %c, encoded_size: %d, width: %d, height: %d\n",
-//	    encodedh->frametype, encodedh->comptype, encoded_size, width, height);
-
-	le2me_rtframeheader(encodedh); /* presumably converts the header fields to host byte order, in place inside the caller's packet -- TODO confirm */
-	switch(encodedh->frametype)
-	{
-	    case 'D':	/* additional data for compressors */
-	    {
-		/* the tables follow the 12-byte header inside encoded */
-		if (encodedh->comptype == 'R')
-		{
-		    RTjpeg_init_decompress ( (unsigned long *)(encoded+12), width, height ); /* NOTE(review): tables are read through unsigned long *, so the element width follows the ABI -- confirm on 64-bit */
-		    mp_msg(MSGT_DECVIDEO, MSGL_V, "Found RTjpeg tables (size: %d, width: %d, height: %d)\n",
-			encoded_size-12, width, height);
-		}
-		break;
-	    }
-	    case 'V': /* frame data: decoded is filled according to comptype */
-	    {
-		int in_len = encodedh->packetlength; /* NOTE(review): taken from the packet header and never checked against encoded_size */
-#ifdef KEEP_BUFFER		
-		if (!previous_buffer) 
-			previous_buffer = ( unsigned char * ) malloc ( out_len + AV_LZO_OUTPUT_PADDING ); /* NOTE(review): result is not checked, and the size comes from the first call's width/height only */
-#endif
-
-		switch(encodedh->comptype) /* no default case: an unknown comptype leaves decoded untouched */
-		{
-		    case '0': /* raw YUV420 */
-			fast_memcpy(decoded, encoded + 12, out_len); /* NOTE(review): copies a full frame without checking encoded_size */
-			break;
-		    case '1': /* RTJpeg */
-			RTjpeg_decompressYUV420 ( ( __s8 * ) encoded + 12, decoded );
-			break;
-		    case '2': /* RTJpeg with LZO */
-			if (!buffer) 
-			    buffer = ( unsigned char * ) malloc ( out_len + AV_LZO_OUTPUT_PADDING ); /* sized from the first call's width/height; not resized later */
-			if (!buffer)
-			{
-			    mp_msg(MSGT_DECVIDEO, MSGL_ERR, "Nuppelvideo: error decompressing\n"); /* this path is an allocation failure, despite the message text */
-			    break;
-			}
-			r = av_lzo1x_decode ( buffer, &out_len, encoded + 12, &in_len ); /* NOTE(review): &out_len is unsigned int *; confirm against av_lzo1x_decode's prototype, which is believed to take int * and update both lengths */
-			if ( r ) 
-			{
-			    mp_msg(MSGT_DECVIDEO, MSGL_ERR, "Nuppelvideo: error decompressing\n");
-			    break;
-			}
-			RTjpeg_decompressYUV420 ( ( __s8 * ) buffer, decoded ); /* second stage: RTjpeg-decode the LZO output */
-			break;
-		    case '3': /* raw YUV420 with LZO */
-			r = av_lzo1x_decode ( decoded, &out_len, encoded + 12, &in_len ); /* decompresses straight into the caller's frame buffer */
-			if ( r ) 
-			{
-			    mp_msg(MSGT_DECVIDEO, MSGL_ERR, "Nuppelvideo: error decompressing\n");
-			    break;
-			}
-			break;
-		    case 'N': /* black frame */
-			memset ( decoded, 0,  width * height ); /* luma plane = 0 */
-			memset ( decoded + width * height, 127, width * height / 2); /* both chroma planes = 127 */
-			break;
-		    case 'L': /* copy last frame */
-#ifdef KEEP_BUFFER
-			fast_memcpy ( decoded, previous_buffer, width*height*3/2); /* NOTE(review): if this is the first 'V' packet, previous_buffer was only just malloc'ed and is uninitialized */
-#endif
-			break;
-		}
-
-#ifdef KEEP_BUFFER
-		fast_memcpy(previous_buffer, decoded, width*height*3/2); /* remember this frame for a later 'L' packet; also runs after the error breaks above */
-#endif
-		break;
-	    }
-	    default:
-		mp_msg(MSGT_DECVIDEO, MSGL_V, "Nuppelvideo: unknwon frametype: %c\n",
-		    encodedh->frametype);
-	}
-}
--- a/libmpcodecs/native/rtjpegn.c	Sun Mar 08 13:21:00 2009 +0000
+++ b/libmpcodecs/native/rtjpegn.c	Sun Mar 08 13:32:42 2009 +0000
@@ -293,152 +293,6 @@
  return (int)co;
 }
 
-/* Stream to Block (decoding): unpack one block of 64 coefficients from strm */
-/* into data, placing each at RTjpeg_ZZ[position] and scaling it by qtbl.    */
-/* Codes start 2 bits wide and can escalate to 4 and then 8 bits.            */
-/* Returns the number of bytes consumed from strm; bt8 is not used here.     */
-static int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl)
-{
- int ci; /* index into strm; adjusted before returning so that it is the consumed byte count */
- register int co; /* position of the coefficient being decoded; counts down from 63 */
- register int i; /* destination index in data, i.e. RTjpeg_ZZ[co] */
- register unsigned char bitten; /* the code value currently extracted from the stream */
- register unsigned char bitoff; /* bit offset of the current code inside strm[ci] */
-
- /* the first byte is always read: it is position 0, taken as unsigned */
- i=RTjpeg_ZZ[0];
- data[i]=((__u8)strm[0])*qtbl[i];
-
- /* we start at the back: the top 6 bits of the second byte give the highest coded position */ 
-
- bitten = ((unsigned char)strm[1]) >> 2;
- co = 63;
- for(; co > bitten; co--) { /* every position above that one is zero */
-
-   data[RTjpeg_ZZ[co]] = 0;
-
- }
-
- if (co==0) { /* only position 0 is coded: exactly two bytes were used */
-   ci = 2;
-   goto AUTOBAHN;
- }
-
- /* 2-bit codes: the first sits in the low 2 bits of the second byte, later ones run from the top of each following byte */
- ci=1;
- bitoff = 0;
-
- for(; co>0; co--) {
-
-  bitten  = ((unsigned char)strm[ci]) >> bitoff;
-  bitten &= 0x03;
-
-  i=RTjpeg_ZZ[co];
-
-  switch( bitten ) {
-  case 0x03: /* coefficient is -1 quantization step */
-    data[i]= -qtbl[i];
-    break;
-  case 0x02: /* escape: position co is re-read below as a 4-bit code (co is not decremented) */
-    goto FUSSWEG;
-    break;
-  case 0x01: /* coefficient is +1 quantization step */
-    data[i]= qtbl[i];
-    break;
-  case 0x00:
-    data[i]= 0;
-    break;
-  default: /* not reachable: bitten was masked to 2 bits */
-    break;
-  } 
-
-  if( bitoff == 0 ) { /* this byte is used up: continue at the top of the next one */
-    bitoff = 8;
-    ci++;
-  }
-  bitoff -= 2;
- }
- /* co is 0 now */
- /* data is written properly */
-
- /* if bitoff!=6 then ci is the index, but should be the byte count, so we increment by 1 */
- if (bitoff!=6) ci++;
-
- goto AUTOBAHN;
- 
- 
-FUSSWEG: /* 4-bit codes ("Fussweg" is German for footpath) */
-/* correct bitoff to a nibble boundary */
- switch(bitoff){
- case 4:
- case 6:
-   bitoff = 0; /* use the low nibble of the current byte */
-   break;
- case 2:
- case 0:
-   /* we have to read from the next byte */
-   ci++;
-   bitoff = 4; /* use the high nibble of that byte */
-   break;
- default:
-   break;
- }
-
- for(; co>0; co--) {
-
-  bitten  = ((unsigned char)strm[ci]) >> bitoff;
-  bitten &= 0x0f;
-
-  i=RTjpeg_ZZ[co];
-
-  if( bitten == 0x08 ) { /* escape: position co is re-read below as a full byte */
-    goto STRASSE;
-  }
-
-  /* the compiler cannot do sign extension for signed nibbles */
-  if( bitten & 0x08 ) {
-    bitten |= 0xf0;
-  }
-  /* the unsigned char bitten now is a valid signed char */
-   
-  data[i]=((signed char)bitten)*qtbl[i];
-  
-  if( bitoff == 0 ) { /* low nibble done: go to the high nibble of the next byte */
-    bitoff = 8;
-    ci++;
-  }
-  bitoff -= 4;
- }
- /* co is 0 */
-
- /* if bitoff!=4 then ci is the index, but should be the byte count, so we increment by 1 */
- if (bitoff!=4) ci++;
-
- goto AUTOBAHN;
-
-STRASSE: /* 8-bit codes ("Strasse" is German for road) */
-  ci++; /* step past the byte that held the escape nibble */
- 
- for(; co>0; co--) { /* each remaining position is one signed byte times its quantizer */
-  i=RTjpeg_ZZ[co];
-  data[i]=strm[ci++]*qtbl[i];
- }
-
- /* ci now is the count, because it points to next element => no incrementing */
-
-AUTOBAHN: /* common exit ("Autobahn" is German for motorway) */
-
-#ifdef SHOWBLOCK
-fprintf(stdout, "\nci = '%d'\n", ci);
- for (i=0; i < 64; i++) {
-   fprintf(stdout, "%d ", data[RTjpeg_ZZ[i]]);
- }
-fprintf(stdout, "\n\n");
-#endif
-
- return ci;
-}
-
 #else
 
 static int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8)
@@ -1536,1148 +1390,6 @@
 #endif
 }
 
-#define FIX_1_082392200  ((__s32)  277)		/* FIX(1.082392200): the value times 256, rounded */
-#define FIX_1_414213562  ((__s32)  362)		/* FIX(1.414213562): the value times 256, rounded */
-#define FIX_1_847759065  ((__s32)  473)		/* FIX(1.847759065): the value times 256, rounded */
-#define FIX_2_613125930  ((__s32)  669)		/* FIX(2.613125930): the value times 256, rounded */
-
-#define DESCALE(x) (__s16)( ((x)+4) >> 3) /* divide by 8 with rounding (+4 before >>3), then narrow to 16 bits */
-/* NOTE(review): DESCALE and RL expand without enclosing parentheses, so they are only safe as a whole expression */
-/* clip yuv to 16..235 (should be 16..240 for cr/cb, but one range is used for all planes) */
-
-#define RL(x) ((x)>235) ? 235 : (((x)<16) ? 16 : (x)) /* evaluates x more than once */
-#define MULTIPLY(var,const)  (((__s32) ((var) * (const)) + 128)>>8) /* multiply by a FIX_* constant: +128 rounds, >>8 removes the 256 scale */
-/* Prepare the inverse-quantization tables for the IDCT by scaling them with RTjpeg_aan_tab. */
-static void RTjpeg_idct_init(void)
-{
- int i; /* table index, 0..63 */
- 
- for(i=0; i<64; i++) /* scales in place, so a second call would scale the tables again */
- {
-  RTjpeg_liqt[i]=((__u64)RTjpeg_liqt[i]*RTjpeg_aan_tab[i])>>32; /* 64-bit product, keep the part above bit 32 */
-  RTjpeg_ciqt[i]=((__u64)RTjpeg_ciqt[i]*RTjpeg_aan_tab[i])>>32; /* same scaling for the second table */
- }
-}
-
-static void RTjpeg_idct(__u8 *odata, __s16 *data, int rskip)
-{
-#if HAVE_MMX
-
-static mmx_t fix_141			= {0x5a825a825a825a82LL};
-static mmx_t fix_184n261	= {0xcf04cf04cf04cf04LL};
-static mmx_t fix_184			= {0x7641764176417641LL};
-static mmx_t fix_n184		= {0x896f896f896f896fLL};
-static mmx_t fix_108n184	= {0xcf04cf04cf04cf04LL};
-
-  mmx_t workspace[64];
-  mmx_t *wsptr = workspace;
-  register mmx_t *dataptr = (mmx_t *)odata;
-  mmx_t *idata = (mmx_t *)data;
-
-  rskip = rskip>>3;
-/*
- * Perform inverse DCT on one block of coefficients.
- */
-
-    /* Odd part */
-
-	movq_m2r(*(idata+10), mm1);	// load idata[DCTSIZE*5]
-
-	movq_m2r(*(idata+6), mm0);		// load idata[DCTSIZE*3]
-
-	movq_m2r(*(idata+2), mm3);		// load idata[DCTSIZE*1]
-
-	movq_r2r(mm1, mm2);				// copy tmp6	/* phase 6 */
-
-	movq_m2r(*(idata+14), mm4);	// load idata[DCTSIZE*7]
-
-	paddw_r2r(mm0, mm1);				// z13 = tmp6 + tmp5;
-
-	psubw_r2r(mm0, mm2);				// z10 = tmp6 - tmp5   
-
-	psllw_i2r(2, mm2);				// shift z10
-	movq_r2r(mm2, mm0); 				// copy z10
-
-	pmulhw_m2r(fix_184n261, mm2);	// MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
-	movq_r2r(mm3, mm5);				// copy tmp4
-
-	pmulhw_m2r(fix_n184, mm0);		// MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
-	paddw_r2r(mm4, mm3);				// z11 = tmp4 + tmp7;
-
-	movq_r2r(mm3, mm6);				// copy z11			/* phase 5 */
-	psubw_r2r(mm4, mm5);				// z12 = tmp4 - tmp7;
-
-	psubw_r2r(mm1, mm6);				// z11-z13
-	psllw_i2r(2, mm5);				//	shift z12
-
-	movq_m2r(*(idata+12), mm4);	// load idata[DCTSIZE*6], even part
- 	movq_r2r(mm5, mm7);				//	copy z12
-
-	pmulhw_m2r(fix_108n184, mm5); //	MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
-	paddw_r2r(mm1, mm3);				// tmp7 = z11 + z13;	
-
-	//ok
-
-    /* Even part */
-	pmulhw_m2r(fix_184, mm7);		// MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
-	psllw_i2r(2, mm6);
-
-	movq_m2r(*(idata+4), mm1);		// load idata[DCTSIZE*2]
-
-	paddw_r2r(mm5, mm0);				//	tmp10
-
-	paddw_r2r(mm7, mm2);				// tmp12
-
-	pmulhw_m2r(fix_141, mm6);		// tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
-	psubw_r2r(mm3, mm2);				// tmp6 = tmp12 - tmp7
-
-	movq_r2r(mm1, mm5);				// copy tmp1
-	paddw_r2r(mm4, mm1);				// tmp13= tmp1 + tmp3;	/* phases 5-3 */
-
-	psubw_r2r(mm4, mm5);				// tmp1-tmp3
-	psubw_r2r(mm2, mm6);				// tmp5 = tmp11 - tmp6;
-
-	movq_r2m(mm1, *(wsptr));		// save tmp13 in workspace
-	psllw_i2r(2, mm5);	// shift tmp1-tmp3
-    
-	movq_m2r(*(idata), mm7); 		// load idata[DCTSIZE*0]
-
-	pmulhw_m2r(fix_141, mm5);		// MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
-	paddw_r2r(mm6, mm0);				// tmp4 = tmp10 + tmp5;
-
-	movq_m2r(*(idata+8), mm4); 	// load idata[DCTSIZE*4]
-	
-	psubw_r2r(mm1, mm5);				// tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
-
-	movq_r2m(mm0, *(wsptr+4));		// save tmp4 in workspace
-	movq_r2r(mm7, mm1);			 	// copy tmp0	/* phase 3 */
-
-	movq_r2m(mm5, *(wsptr+2));		// save tmp12 in workspace
-	psubw_r2r(mm4, mm1);				// tmp11 = tmp0 - tmp2; 
-
-	paddw_r2r(mm4, mm7);				// tmp10 = tmp0 + tmp2;
-   movq_r2r(mm1, mm5);				// copy tmp11
-	
-	paddw_m2r(*(wsptr+2), mm1);	// tmp1 = tmp11 + tmp12;
-	movq_r2r(mm7, mm4);				// copy tmp10		/* phase 2 */
-
-	paddw_m2r(*(wsptr), mm7);		// tmp0 = tmp10 + tmp13;	
-
-	psubw_m2r(*(wsptr), mm4);		// tmp3 = tmp10 - tmp13;
-	movq_r2r(mm7, mm0);				//	copy tmp0
-
-	psubw_m2r(*(wsptr+2), mm5);	// tmp2 = tmp11 - tmp12;
-	paddw_r2r(mm3, mm7);				//	wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
-	
-	psubw_r2r(mm3, mm0);				// wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
-
-	movq_r2m(mm7, *(wsptr));		//	wsptr[DCTSIZE*0]
-	movq_r2r(mm1, mm3);				//	copy tmp1
-
-	movq_r2m(mm0, *(wsptr+14));		// wsptr[DCTSIZE*7]
-	paddw_r2r(mm2, mm1);				// wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
-
-	psubw_r2r(mm2, mm3);				// wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
-
-	movq_r2m(mm1, *(wsptr+2));		// wsptr[DCTSIZE*1]
-	movq_r2r(mm4, mm1);				//	copy tmp3
-
-	movq_r2m(mm3, *(wsptr+12));		// wsptr[DCTSIZE*6]
-
-	paddw_m2r(*(wsptr+4), mm4);	// wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
-
-	psubw_m2r(*(wsptr+4), mm1); 	// wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
-
-	movq_r2m(mm4, *(wsptr+8));		
-	movq_r2r(mm5, mm7);				// copy tmp2
-
-	paddw_r2r(mm6, mm5);				// wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
-
-	movq_r2m(mm1, *(wsptr+6));	
-	psubw_r2r(mm6, mm7);				//	wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
-
-	movq_r2m(mm5, *(wsptr+4));	
-
-	movq_r2m(mm7, *(wsptr+10));		
-
-	//ok
-
-
-/*****************************************************************/
-
-	idata++;
-	wsptr++;
-
-/*****************************************************************/
-
-	movq_m2r(*(idata+10), mm1);	// load idata[DCTSIZE*5]
-
-	movq_m2r(*(idata+6), mm0);		// load idata[DCTSIZE*3]
-
-	movq_m2r(*(idata+2),	mm3);		// load idata[DCTSIZE*1]
-	movq_r2r(mm1, mm2);				//	copy tmp6	/* phase 6 */
-
-	movq_m2r(*(idata+14),	mm4);		// load idata[DCTSIZE*7]
-	paddw_r2r(mm0, mm1);				//	z13 = tmp6 + tmp5;
-
-	psubw_r2r(mm0, mm2);				//	z10 = tmp6 - tmp5   
-
-	psllw_i2r(2, mm2);				//	shift z10
-	movq_r2r(mm2, mm0);				//	copy z10
-
-	pmulhw_m2r(fix_184n261, mm2);	// MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
-	movq_r2r(mm3, mm5);				//	copy tmp4
-
-	pmulhw_m2r(fix_n184, mm0);		// MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
-	paddw_r2r(mm4, mm3);				// z11 = tmp4 + tmp7;
-
-	movq_r2r(mm3, mm6);				// copy z11			/* phase 5 */
-	psubw_r2r(mm4, mm5);				//	z12 = tmp4 - tmp7;
-
-	psubw_r2r(mm1, mm6);				// z11-z13
-	psllw_i2r(2, mm5);				//	shift z12
-
-	movq_m2r(*(idata+12), mm4);	// load idata[DCTSIZE*6], even part
- 	movq_r2r(mm5, mm7);				// copy z12
-
-	pmulhw_m2r(fix_108n184, mm5);	// MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
-	paddw_r2r(mm1, mm3);				// tmp7 = z11 + z13;	
-
-	//ok
-
-    /* Even part */
-	pmulhw_m2r(fix_184, mm7);		// MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
-	psllw_i2r(2, mm6);
-
-	movq_m2r(*(idata+4), mm1);		// load idata[DCTSIZE*2]
-
-	paddw_r2r(mm5, mm0);				//	tmp10
-
-	paddw_r2r(mm7, mm2);				// tmp12
-
-	pmulhw_m2r(fix_141, mm6);		// tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
-	psubw_r2r(mm3, mm2);				// tmp6 = tmp12 - tmp7
-
-	movq_r2r(mm1, mm5);				// copy tmp1
-	paddw_r2r(mm4, mm1);				// tmp13= tmp1 + tmp3;	/* phases 5-3 */
-
-	psubw_r2r(mm4, mm5);				// tmp1-tmp3
-	psubw_r2r(mm2, mm6);				// tmp5 = tmp11 - tmp6;
-
-	movq_r2m(mm1, *(wsptr));		// save tmp13 in workspace
-	psllw_i2r(2, mm5); 				// shift tmp1-tmp3
-    
-	movq_m2r(*(idata), mm7);		// load idata[DCTSIZE*0]
-	paddw_r2r(mm6, mm0);				// tmp4 = tmp10 + tmp5;
-
-	pmulhw_m2r(fix_141, mm5);		// MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
-
-	movq_m2r(*(idata+8), mm4);    // load idata[DCTSIZE*4]
-	
-	psubw_r2r(mm1, mm5);				// tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
-
-	movq_r2m(mm0, *(wsptr+4));		// save tmp4 in workspace
-	movq_r2r(mm7, mm1);				// copy tmp0	/* phase 3 */
-
-	movq_r2m(mm5, *(wsptr+2));		// save tmp12 in workspace
-	psubw_r2r(mm4, mm1);				// tmp11 = tmp0 - tmp2; 
-
-	paddw_r2r(mm4, mm7);				// tmp10 = tmp0 + tmp2;
-   movq_r2r(mm1, mm5);				// copy tmp11
-	
-	paddw_m2r(*(wsptr+2), mm1);	// tmp1 = tmp11 + tmp12;
-	movq_r2r(mm7, mm4);				// copy tmp10		/* phase 2 */
-
-	paddw_m2r(*(wsptr), mm7);		// tmp0 = tmp10 + tmp13;	
-
-	psubw_m2r(*(wsptr), mm4);		// tmp3 = tmp10 - tmp13;
-	movq_r2r(mm7, mm0);				// copy tmp0
-
-	psubw_m2r(*(wsptr+2), mm5);	// tmp2 = tmp11 - tmp12;
-	paddw_r2r(mm3, mm7);				// wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
-	
-	psubw_r2r(mm3, mm0);				// wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
-
-	movq_r2m(mm7, *(wsptr));		// wsptr[DCTSIZE*0]
-	movq_r2r(mm1, mm3);				// copy tmp1
-
-	movq_r2m(mm0, *(wsptr+14));		// wsptr[DCTSIZE*7]
-	paddw_r2r(mm2, mm1);				// wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
-
-	psubw_r2r(mm2, mm3);				// wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
-
-	movq_r2m(mm1, *(wsptr+2));		// wsptr[DCTSIZE*1]
-	movq_r2r(mm4, mm1);				// copy tmp3
-
-	movq_r2m(mm3, *(wsptr+12));		// wsptr[DCTSIZE*6]
-
-	paddw_m2r(*(wsptr+4), mm4);	// wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
-
-	psubw_m2r(*(wsptr+4), mm1);	// wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
-
-	movq_r2m(mm4, *(wsptr+8));		
-	movq_r2r(mm5, mm7);				// copy tmp2
-
-	paddw_r2r(mm6, mm5);				// wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
-
-	movq_r2m(mm1, *(wsptr+6));		
-	psubw_r2r(mm6, mm7);				// wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
-
-	movq_r2m(mm5, *(wsptr+4));	
-
-	movq_r2m(mm7, *(wsptr+10));
-
-/*****************************************************************/
-
-  /* Pass 2: process rows from work array, store into output array. */
-  /* Note that we must descale the results by a factor of 8 == 2**3, */
-  /* and also undo the PASS1_BITS scaling. */
-
-/*****************************************************************/
-    /* Even part */
-
-	wsptr--;
-
-//    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
-//    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
-//    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
-//    tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
-	movq_m2r(*(wsptr), mm0);		// wsptr[0,0],[0,1],[0,2],[0,3]
-
-	movq_m2r(*(wsptr+1),	mm1);		// wsptr[0,4],[0,5],[0,6],[0,7]
-	movq_r2r(mm0, mm2);
-	
-	movq_m2r(*(wsptr+2), mm3);		// wsptr[1,0],[1,1],[1,2],[1,3]
-	paddw_r2r(mm1, mm0);				// wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
-
-	movq_m2r(*(wsptr+3), mm4);		// wsptr[1,4],[1,5],[1,6],[1,7]
-	psubw_r2r(mm1, mm2);				// wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
-
-	movq_r2r(mm0, mm6);
-	movq_r2r(mm3, mm5);
-	
-	paddw_r2r(mm4, mm3);				// wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
-	movq_r2r(mm2, mm1);
-
-	psubw_r2r(mm4, mm5);				// wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
-	punpcklwd_r2r(mm3, mm0);		// wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
-
-	movq_m2r(*(wsptr+7), mm7);		// wsptr[3,4],[3,5],[3,6],[3,7]
-	punpckhwd_r2r(mm3, mm6);		// wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
-
-	movq_m2r(*(wsptr+4), mm3);		// wsptr[2,0],[2,1],[2,2],[2,3]
-	punpckldq_r2r(mm6, mm0);		// wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
-
-	punpcklwd_r2r(mm5, mm1);		// wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
-	movq_r2r(mm3, mm4);
-
-	movq_m2r(*(wsptr+6), mm6);		// wsptr[3,0],[3,1],[3,2],[3,3]
-	punpckhwd_r2r(mm5, mm2);		// wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
-
-	movq_m2r(*(wsptr+5), mm5);		// wsptr[2,4],[2,5],[2,6],[2,7]
-	punpckldq_r2r(mm2, mm1);		// wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
-
-	
-	paddw_r2r(mm5, mm3);				// wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
-	movq_r2r(mm6, mm2);
-
-	psubw_r2r(mm5, mm4);				// wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
-	paddw_r2r(mm7, mm6);				// wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
-
-	movq_r2r(mm3, mm5);
-	punpcklwd_r2r(mm6, mm3);		// wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
-	
-	psubw_r2r(mm7, mm2);				// wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
-	punpckhwd_r2r(mm6, mm5);		// wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
-
-	movq_r2r(mm4, mm7);
-	punpckldq_r2r(mm5, mm3);		// wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
-						 
-	punpcklwd_r2r(mm2, mm4);		// wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
-
-	punpckhwd_r2r(mm2, mm7);		// wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
-
-	punpckldq_r2r(mm7, mm4);		// wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
-	movq_r2r(mm1, mm6);
-
-	//ok
-
-//	mm0 = 	;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
-//	mm1 =	;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
-
-
-	movq_r2r(mm0, mm2);
-	punpckhdq_r2r(mm4, mm6);		// wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
-
-	punpckldq_r2r(mm4, mm1);		// wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
-	psllw_i2r(2, mm6);
-
-	pmulhw_m2r(fix_141, mm6);
-	punpckldq_r2r(mm3, mm0);		// wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
-
-	punpckhdq_r2r(mm3, mm2);		// wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
-	movq_r2r(mm0, mm7);
-
-//    tmp0 = tmp10 + tmp13;
-//    tmp3 = tmp10 - tmp13;
-	paddw_r2r(mm2, mm0);				// [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
-	psubw_r2r(mm2, mm7);				// [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
-
-//    tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
-	psubw_r2r(mm2, mm6);				// wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
-//    tmp1 = tmp11 + tmp12;
-//    tmp2 = tmp11 - tmp12;
-	movq_r2r(mm1, mm5);
-
-	//OK
-
-    /* Odd part */
-
-//    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
-//    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
-//    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
-//    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
-	movq_m2r(*(wsptr), mm3);		// wsptr[0,0],[0,1],[0,2],[0,3]
-	paddw_r2r(mm6, mm1);				// [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
-
-	movq_m2r(*(wsptr+1), mm4);		// wsptr[0,4],[0,5],[0,6],[0,7]
-	psubw_r2r(mm6, mm5);				// [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
-
-	movq_r2r(mm3, mm6);
-	punpckldq_r2r(mm4, mm3);		// wsptr[0,0],[0,1],[0,4],[0,5]
-
-	punpckhdq_r2r(mm6, mm4);		// wsptr[0,6],[0,7],[0,2],[0,3]
-	movq_r2r(mm3, mm2);
-
-//Save tmp0 and tmp1 in wsptr
-	movq_r2m(mm0, *(wsptr));		// save tmp0
-	paddw_r2r(mm4, mm2);				// wsptr[xxx],[0,z11],[xxx],[0,z13]
-
-	
-//Continue with z10 --- z13
-	movq_m2r(*(wsptr+2), mm6);		// wsptr[1,0],[1,1],[1,2],[1,3]
-	psubw_r2r(mm4, mm3);				// wsptr[xxx],[0,z12],[xxx],[0,z10]
-
-	movq_m2r(*(wsptr+3), mm0);		// wsptr[1,4],[1,5],[1,6],[1,7]
-	movq_r2r(mm6, mm4);
-
-	movq_r2m(mm1, *(wsptr+1));		// save tmp1
-	punpckldq_r2r(mm0, mm6);		// wsptr[1,0],[1,1],[1,4],[1,5]
-
-	punpckhdq_r2r(mm4, mm0);		// wsptr[1,6],[1,7],[1,2],[1,3]
-	movq_r2r(mm6, mm1);
-	
-//Save tmp2 and tmp3 in wsptr
-	paddw_r2r(mm0, mm6);				// wsptr[xxx],[1,z11],[xxx],[1,z13]
-	movq_r2r(mm2, mm4);
-	
-//Continue with z10 --- z13
-	movq_r2m(mm5, *(wsptr+2));		// save tmp2
-	punpcklwd_r2r(mm6, mm2);		// wsptr[xxx],[xxx],[0,z11],[1,z11]
-
-	psubw_r2r(mm0, mm1);				// wsptr[xxx],[1,z12],[xxx],[1,z10]
-	punpckhwd_r2r(mm6, mm4);		// wsptr[xxx],[xxx],[0,z13],[1,z13]
-
-	movq_r2r(mm3, mm0);
-	punpcklwd_r2r(mm1, mm3);		// wsptr[xxx],[xxx],[0,z12],[1,z12]
-
-	movq_r2m(mm7, *(wsptr+3));		// save tmp3
-	punpckhwd_r2r(mm1, mm0);		// wsptr[xxx],[xxx],[0,z10],[1,z10]
-
-	movq_m2r(*(wsptr+4), mm6);		// wsptr[2,0],[2,1],[2,2],[2,3]
-	punpckhdq_r2r(mm2, mm0);		// wsptr[0,z10],[1,z10],[0,z11],[1,z11]
-
-	movq_m2r(*(wsptr+5), mm7);	// wsptr[2,4],[2,5],[2,6],[2,7]
-	punpckhdq_r2r(mm4, mm3);		// wsptr[0,z12],[1,z12],[0,z13],[1,z13]
-
-	movq_m2r(*(wsptr+6), mm1);	// wsptr[3,0],[3,1],[3,2],[3,3]
-	movq_r2r(mm6, mm4);
-
-	punpckldq_r2r(mm7, mm6);		// wsptr[2,0],[2,1],[2,4],[2,5]
-	movq_r2r(mm1, mm5);
-
-	punpckhdq_r2r(mm4, mm7);		// wsptr[2,6],[2,7],[2,2],[2,3]
-	movq_r2r(mm6, mm2);
-	
-	movq_m2r(*(wsptr+7), mm4);	// wsptr[3,4],[3,5],[3,6],[3,7]
-	paddw_r2r(mm7, mm6);				// wsptr[xxx],[2,z11],[xxx],[2,z13]
-
-	psubw_r2r(mm7, mm2);				// wsptr[xxx],[2,z12],[xxx],[2,z10]
-	punpckldq_r2r(mm4, mm1);		// wsptr[3,0],[3,1],[3,4],[3,5]
-
-	punpckhdq_r2r(mm5, mm4);		// wsptr[3,6],[3,7],[3,2],[3,3]
-	movq_r2r(mm1, mm7);
-
-	paddw_r2r(mm4, mm1);				// wsptr[xxx],[3,z11],[xxx],[3,z13]
-	psubw_r2r(mm4, mm7);				// wsptr[xxx],[3,z12],[xxx],[3,z10]
-
-	movq_r2r(mm6, mm5);
-	punpcklwd_r2r(mm1, mm6);		// wsptr[xxx],[xxx],[2,z11],[3,z11]
-
-	punpckhwd_r2r(mm1, mm5);		// wsptr[xxx],[xxx],[2,z13],[3,z13]
-	movq_r2r(mm2, mm4);
-
-	punpcklwd_r2r(mm7, mm2);		// wsptr[xxx],[xxx],[2,z12],[3,z12]
-
-	punpckhwd_r2r(mm7, mm4);		// wsptr[xxx],[xxx],[2,z10],[3,z10]
-
-	punpckhdq_r2r(mm6, mm4);		/// wsptr[2,z10],[3,z10],[2,z11],[3,z11]
-
-	punpckhdq_r2r(mm5, mm2);		// wsptr[2,z12],[3,z12],[2,z13],[3,z13]
-	movq_r2r(mm0, mm5);
-
-	punpckldq_r2r(mm4, mm0);		// wsptr[0,z10],[1,z10],[2,z10],[3,z10]
-
-	punpckhdq_r2r(mm4, mm5);		// wsptr[0,z11],[1,z11],[2,z11],[3,z11]
-	movq_r2r(mm3, mm4);
-
-	punpckhdq_r2r(mm2, mm4);		// wsptr[0,z13],[1,z13],[2,z13],[3,z13]
-	movq_r2r(mm5, mm1);
-
-	punpckldq_r2r(mm2, mm3);		// wsptr[0,z12],[1,z12],[2,z12],[3,z12]
-//    tmp7 = z11 + z13;		/* phase 5 */
-//    tmp8 = z11 - z13;		/* phase 5 */
-	psubw_r2r(mm4, mm1);				// tmp8
-
-	paddw_r2r(mm4, mm5);				// tmp7
-//    tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
-	psllw_i2r(2, mm1);
-
-	psllw_i2r(2, mm0);
-
-	pmulhw_m2r(fix_141, mm1);		// tmp21
-//    tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065))  /* 2*(c2-c6) */
-//			+ MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
-	psllw_i2r(2, mm3);
-	movq_r2r(mm0, mm7);
-
-	pmulhw_m2r(fix_n184, mm7);
-	movq_r2r(mm3, mm6);
-
-	movq_m2r(*(wsptr), mm2);		// tmp0,final1
-
-	pmulhw_m2r(fix_108n184, mm6);
-//	 tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
-//			+ MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
-	movq_r2r(mm2, mm4);				// final1
-  
-	pmulhw_m2r(fix_184n261, mm0);
-	paddw_r2r(mm5, mm2);				// tmp0+tmp7,final1
-
-	pmulhw_m2r(fix_184, mm3);
-	psubw_r2r(mm5, mm4);				// tmp0-tmp7,final1
-
-//    tmp6 = tmp22 - tmp7;	/* phase 2 */
-	psraw_i2r(3, mm2);				// outptr[0,0],[1,0],[2,0],[3,0],final1
-
-	paddw_r2r(mm6, mm7);				// tmp20
-	psraw_i2r(3, mm4);				// outptr[0,7],[1,7],[2,7],[3,7],final1
-
-	paddw_r2r(mm0, mm3);				// tmp22
-
-//    tmp5 = tmp21 - tmp6;
-	psubw_r2r(mm5, mm3);				// tmp6
-
-//    tmp4 = tmp20 + tmp5;
-	movq_m2r(*(wsptr+1), mm0);		// tmp1,final2
-	psubw_r2r(mm3, mm1);				// tmp5
-
-	movq_r2r(mm0, mm6);				// final2
-	paddw_r2r(mm3, mm0);				// tmp1+tmp6,final2
-
-    /* Final output stage: scale down by a factor of 8 and range-limit */
-
-
-//    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
-//			    & RANGE_MASK];
-//    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
-//			    & RANGE_MASK];	final1
-
-
-//    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
-//			    & RANGE_MASK];
-//    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
-//			    & RANGE_MASK];	final2
-	psubw_r2r(mm3, mm6);				// tmp1-tmp6,final2
-	psraw_i2r(3, mm0);				// outptr[0,1],[1,1],[2,1],[3,1]
-
-	psraw_i2r(3, mm6);				// outptr[0,6],[1,6],[2,6],[3,6]
-	
-	packuswb_r2r(mm4, mm0);			// out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
-	
-	movq_m2r(*(wsptr+2), mm5);		// tmp2,final3
-	packuswb_r2r(mm6, mm2);			// out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
-
-//    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
-//			    & RANGE_MASK];
-//    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
-//			    & RANGE_MASK];	final3
-	paddw_r2r(mm1, mm7);				// tmp4
-	movq_r2r(mm5, mm3);
-
-	paddw_r2r(mm1, mm5);				// tmp2+tmp5
-	psubw_r2r(mm1, mm3);				// tmp2-tmp5
-
-	psraw_i2r(3, mm5);				// outptr[0,2],[1,2],[2,2],[3,2]
-
-	movq_m2r(*(wsptr+3), mm4);		// tmp3,final4
-	psraw_i2r(3, mm3);				// outptr[0,5],[1,5],[2,5],[3,5]
-
-
-
-//    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
-//			    & RANGE_MASK];
-//    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
-//			    & RANGE_MASK];	final4
-	movq_r2r(mm4, mm6);
-	paddw_r2r(mm7, mm4);				// tmp3+tmp4
-
-	psubw_r2r(mm7, mm6);				// tmp3-tmp4
-	psraw_i2r(3, mm4);				// outptr[0,4],[1,4],[2,4],[3,4]
-
-	// mov			ecx, [dataptr]
-
-	psraw_i2r(3, mm6);				// outptr[0,3],[1,3],[2,3],[3,3]
-
-	packuswb_r2r(mm4, mm5);			// out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
-
-	packuswb_r2r(mm3, mm6);			// out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
-	movq_r2r(mm2, mm4);
-
-	movq_r2r(mm5, mm7);
-	punpcklbw_r2r(mm0, mm2);		// out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
-
-	punpckhbw_r2r(mm0, mm4);		// out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
-	movq_r2r(mm2, mm1);
-
-	punpcklbw_r2r(mm6, mm5);		// out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
-
-	// add		 	dataptr, 4
-
-	punpckhbw_r2r(mm6, mm7);		// out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
-
-	punpcklwd_r2r(mm5, mm2);		// out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
-	
-	// add			ecx, output_col
-
-	movq_r2r(mm7, mm6);
-	punpckhwd_r2r(mm5, mm1);		// out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
-
-	movq_r2r(mm2, mm0);
-	punpcklwd_r2r(mm4, mm6);		// out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
-
-	// mov			idata, [dataptr]
-	
-	punpckldq_r2r(mm6, mm2);		// out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
-
-	// add		 	dataptr, 4
-	 
-	movq_r2r(mm1, mm3);
-
-	// add			idata, output_col 
-	
-	punpckhwd_r2r(mm4, mm7);		// out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
-	
-	movq_r2m(mm2, *(dataptr));
-	
-	punpckhdq_r2r(mm6, mm0);		// out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
-
-	dataptr += rskip;
-	movq_r2m(mm0, *(dataptr));
-
-	punpckldq_r2r(mm7, mm1);		// out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
-	punpckhdq_r2r(mm7, mm3);		// out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
-	
-	dataptr += rskip;
-	movq_r2m(mm1, *(dataptr));
-
-	dataptr += rskip;
-	movq_r2m(mm3, *(dataptr));
-
-/*******************************************************************/
-
-	wsptr += 8;
-
-/*******************************************************************/
-
-//    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
-//    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
-//    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
-//    tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
-	movq_m2r(*(wsptr), mm0);		// wsptr[0,0],[0,1],[0,2],[0,3]
-
-	movq_m2r(*(wsptr+1), mm1);		// wsptr[0,4],[0,5],[0,6],[0,7]
-	movq_r2r(mm0, mm2);
-	
-	movq_m2r(*(wsptr+2), mm3);		// wsptr[1,0],[1,1],[1,2],[1,3]
-	paddw_r2r(mm1, mm0);				// wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
-
-	movq_m2r(*(wsptr+3), mm4);		// wsptr[1,4],[1,5],[1,6],[1,7]
-	psubw_r2r(mm1, mm2);				// wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
-
-	movq_r2r(mm0, mm6);
-	movq_r2r(mm3, mm5);
-	
-	paddw_r2r(mm4, mm3);				// wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
-	movq_r2r(mm2, mm1);
-
-	psubw_r2r(mm4, mm5);				// wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
-	punpcklwd_r2r(mm3, mm0);		// wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
-
-	movq_m2r(*(wsptr+7), mm7);	// wsptr[3,4],[3,5],[3,6],[3,7]
-	punpckhwd_r2r(mm3, mm6);		// wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
-
-	movq_m2r(*(wsptr+4),	mm3);		// wsptr[2,0],[2,1],[2,2],[2,3]
-	punpckldq_r2r(mm6, mm0);		// wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
-
-	punpcklwd_r2r(mm5, mm1);		// wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
-	movq_r2r(mm3, mm4);
-
-	movq_m2r(*(wsptr+6), mm6);	// wsptr[3,0],[3,1],[3,2],[3,3]
-	punpckhwd_r2r(mm5, mm2);		// wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
-
-	movq_m2r(*(wsptr+5), mm5);	// wsptr[2,4],[2,5],[2,6],[2,7]
-	punpckldq_r2r(mm2, mm1);		// wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
-
-	paddw_r2r(mm5, mm3);				// wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
-	movq_r2r(mm6, mm2);
-
-	psubw_r2r(mm5, mm4);				// wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
-	paddw_r2r(mm7, mm6);				// wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
-
-	movq_r2r(mm3, mm5);
-	punpcklwd_r2r(mm6, mm3);		// wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
-	
-	psubw_r2r(mm7, mm2);				// wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
-	punpckhwd_r2r(mm6, mm5);		// wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
-
-	movq_r2r(mm4, mm7);
-	punpckldq_r2r(mm5, mm3);		// wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
-
-	punpcklwd_r2r(mm2, mm4);		// wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
-
-	punpckhwd_r2r(mm2, mm7);		// wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
-
-	punpckldq_r2r(mm7, mm4);		// wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
-	movq_r2r(mm1, mm6);
-
-	//OK
-
-//	mm0 = 	;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
-//	mm1 =	;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
-
-	movq_r2r(mm0, mm2);
-	punpckhdq_r2r(mm4, mm6);		// wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
-
-	punpckldq_r2r(mm4, mm1);		// wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
-	psllw_i2r(2, mm6);
-
-	pmulhw_m2r(fix_141, mm6);
-	punpckldq_r2r(mm3, mm0);		// wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
-
-	punpckhdq_r2r(mm3, mm2);		// wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
-	movq_r2r(mm0, mm7);
-
-//    tmp0 = tmp10 + tmp13;
-//    tmp3 = tmp10 - tmp13;
-	paddw_r2r(mm2, mm0);				// [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
-	psubw_r2r(mm2, mm7);				// [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
-
-//    tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
-	psubw_r2r(mm2, mm6);				// wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
-//    tmp1 = tmp11 + tmp12;
-//    tmp2 = tmp11 - tmp12;
-	movq_r2r(mm1, mm5);
-
-	 //OK
-
-
-    /* Odd part */
-
-//    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
-//    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
-//    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
-//    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
-	movq_m2r(*(wsptr), mm3);		// wsptr[0,0],[0,1],[0,2],[0,3]
-	paddw_r2r(mm6, mm1);				// [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
-
-	movq_m2r(*(wsptr+1),	mm4);		// wsptr[0,4],[0,5],[0,6],[0,7]
-	psubw_r2r(mm6, mm5);				// [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
-
-	movq_r2r(mm3, mm6);
-	punpckldq_r2r(mm4, mm3);		// wsptr[0,0],[0,1],[0,4],[0,5]
-
-	punpckhdq_r2r(mm6, mm4);		// wsptr[0,6],[0,7],[0,2],[0,3]
-	movq_r2r(mm3, mm2);
-
-//Save tmp0 and tmp1 in wsptr
-	movq_r2m(mm0, *(wsptr));		// save tmp0
-	paddw_r2r(mm4, mm2);				// wsptr[xxx],[0,z11],[xxx],[0,z13]
-
-	
-//Continue with z10 --- z13
-	movq_m2r(*(wsptr+2), mm6);		// wsptr[1,0],[1,1],[1,2],[1,3]
-	psubw_r2r(mm4, mm3);				// wsptr[xxx],[0,z12],[xxx],[0,z10]
-
-	movq_m2r(*(wsptr+3), mm0);		// wsptr[1,4],[1,5],[1,6],[1,7]
-	movq_r2r(mm6, mm4);
-
-	movq_r2m(mm1, *(wsptr+1));		// save tmp1
-	punpckldq_r2r(mm0, mm6);		// wsptr[1,0],[1,1],[1,4],[1,5]
-
-	punpckhdq_r2r(mm4, mm0);		// wsptr[1,6],[1,7],[1,2],[1,3]
-	movq_r2r(mm6, mm1);
-	
-//Save tmp2 and tmp3 in wsptr
-	paddw_r2r(mm0, mm6);				// wsptr[xxx],[1,z11],[xxx],[1,z13]
-	movq_r2r(mm2, mm4);
-	
-//Continue with z10 --- z13
-	movq_r2m(mm5, *(wsptr+2));		// save tmp2
-	punpcklwd_r2r(mm6, mm2);		// wsptr[xxx],[xxx],[0,z11],[1,z11]
-
-	psubw_r2r(mm0, mm1);				// wsptr[xxx],[1,z12],[xxx],[1,z10]
-	punpckhwd_r2r(mm6, mm4);		// wsptr[xxx],[xxx],[0,z13],[1,z13]
-
-	movq_r2r(mm3, mm0);
-	punpcklwd_r2r(mm1, mm3);		// wsptr[xxx],[xxx],[0,z12],[1,z12]
-
-	movq_r2m(mm7, *(wsptr+3));		// save tmp3
-	punpckhwd_r2r(mm1, mm0);		// wsptr[xxx],[xxx],[0,z10],[1,z10]
-
-	movq_m2r(*(wsptr+4), mm6);		// wsptr[2,0],[2,1],[2,2],[2,3]
-	punpckhdq_r2r(mm2, mm0);		// wsptr[0,z10],[1,z10],[0,z11],[1,z11]
-
-	movq_m2r(*(wsptr+5), mm7);	// wsptr[2,4],[2,5],[2,6],[2,7]
-	punpckhdq_r2r(mm4, mm3);		// wsptr[0,z12],[1,z12],[0,z13],[1,z13]
-
-	movq_m2r(*(wsptr+6), mm1);	// wsptr[3,0],[3,1],[3,2],[3,3]
-	movq_r2r(mm6, mm4);
-
-	punpckldq_r2r(mm7, mm6);		// wsptr[2,0],[2,1],[2,4],[2,5]
-	movq_r2r(mm1, mm5);
-
-	punpckhdq_r2r(mm4, mm7);		// wsptr[2,6],[2,7],[2,2],[2,3]
-	movq_r2r(mm6, mm2);
-	
-	movq_m2r(*(wsptr+7), mm4);	// wsptr[3,4],[3,5],[3,6],[3,7]
-	paddw_r2r(mm7, mm6);				// wsptr[xxx],[2,z11],[xxx],[2,z13]
-
-	psubw_r2r(mm7, mm2);				// wsptr[xxx],[2,z12],[xxx],[2,z10]
-	punpckldq_r2r(mm4, mm1);		// wsptr[3,0],[3,1],[3,4],[3,5]
-
-	punpckhdq_r2r(mm5, mm4);		// wsptr[3,6],[3,7],[3,2],[3,3]
-	movq_r2r(mm1, mm7);
-
-	paddw_r2r(mm4, mm1);				// wsptr[xxx],[3,z11],[xxx],[3,z13]
-	psubw_r2r(mm4, mm7);				// wsptr[xxx],[3,z12],[xxx],[3,z10]
-
-	movq_r2r(mm6, mm5);
-	punpcklwd_r2r(mm1, mm6);		// wsptr[xxx],[xxx],[2,z11],[3,z11]
-
-	punpckhwd_r2r(mm1, mm5);		// wsptr[xxx],[xxx],[2,z13],[3,z13]
-	movq_r2r(mm2, mm4);
-
-	punpcklwd_r2r(mm7, mm2);		// wsptr[xxx],[xxx],[2,z12],[3,z12]
-
-	punpckhwd_r2r(mm7, mm4);		// wsptr[xxx],[xxx],[2,z10],[3,z10]
-
-	punpckhdq_r2r(mm6, mm4);		// wsptr[2,z10],[3,z10],[2,z11],[3,z11]
-
-	punpckhdq_r2r(mm5, mm2);		// wsptr[2,z12],[3,z12],[2,z13],[3,z13]
-	movq_r2r(mm0, mm5);
-
-	punpckldq_r2r(mm4, mm0);		// wsptr[0,z10],[1,z10],[2,z10],[3,z10]
-
-	punpckhdq_r2r(mm4, mm5);		// wsptr[0,z11],[1,z11],[2,z11],[3,z11]
-	movq_r2r(mm3, mm4);
-
-	punpckhdq_r2r(mm2, mm4);		// wsptr[0,z13],[1,z13],[2,z13],[3,z13]
-	movq_r2r(mm5, mm1);
-
-	punpckldq_r2r(mm2, mm3);		// wsptr[0,z12],[1,z12],[2,z12],[3,z12]
-//    tmp7 = z11 + z13;		/* phase 5 */
-//    tmp8 = z11 - z13;		/* phase 5 */
-	psubw_r2r(mm4, mm1);				// tmp8
-
-	paddw_r2r(mm4, mm5);				// tmp7
-//    tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
-	psllw_i2r(2, mm1);
-
-	psllw_i2r(2, mm0);
-
-	pmulhw_m2r(fix_141, mm1);		// tmp21
-//    tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065))  /* 2*(c2-c6) */
-//			+ MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
-	psllw_i2r(2, mm3);
-	movq_r2r(mm0, mm7);
-
-	pmulhw_m2r(fix_n184, mm7);
-	movq_r2r(mm3, mm6);
-
-	movq_m2r(*(wsptr), mm2);		// tmp0,final1
-
-	pmulhw_m2r(fix_108n184, mm6);
-//	 tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
-//			+ MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
-	movq_r2r(mm2, mm4);				// final1
-  
-	pmulhw_m2r(fix_184n261, mm0);
-	paddw_r2r(mm5, mm2);				// tmp0+tmp7,final1
-
-	pmulhw_m2r(fix_184, mm3);
-	psubw_r2r(mm5, mm4);				// tmp0-tmp7,final1
-
-//    tmp6 = tmp22 - tmp7;	/* phase 2 */
-	psraw_i2r(3, mm2);				// outptr[0,0],[1,0],[2,0],[3,0],final1
-
-	paddw_r2r(mm6, mm7);				// tmp20
-	psraw_i2r(3, mm4);				// outptr[0,7],[1,7],[2,7],[3,7],final1
-
-	paddw_r2r(mm0, mm3);				// tmp22
-
-//    tmp5 = tmp21 - tmp6;
-	psubw_r2r(mm5, mm3);				// tmp6
-
-//    tmp4 = tmp20 + tmp5;
-	movq_m2r(*(wsptr+1), mm0);		// tmp1,final2
-	psubw_r2r(mm3, mm1);				// tmp5
-
-	movq_r2r(mm0, mm6);				// final2
-	paddw_r2r(mm3, mm0);				// tmp1+tmp6,final2
-
-    /* Final output stage: scale down by a factor of 8 and range-limit */
-
-//    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
-//			    & RANGE_MASK];
-//    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
-//			    & RANGE_MASK];	final1
-
-
-//    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
-//			    & RANGE_MASK];
-//    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
-//			    & RANGE_MASK];	final2
-	psubw_r2r(mm3, mm6);				// tmp1-tmp6,final2
-	psraw_i2r(3, mm0);				// outptr[0,1],[1,1],[2,1],[3,1]
-
-	psraw_i2r(3, mm6);				// outptr[0,6],[1,6],[2,6],[3,6]
-	
-	packuswb_r2r(mm4, mm0);			// out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
-	
-	movq_m2r(*(wsptr+2), mm5);		// tmp2,final3
-	packuswb_r2r(mm6, mm2);			// out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
-
-//    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
-//			    & RANGE_MASK];
-//    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
-//			    & RANGE_MASK];	final3
-	paddw_r2r(mm1, mm7);				// tmp4
-	movq_r2r(mm5, mm3);
-
-	paddw_r2r(mm1, mm5);				// tmp2+tmp5
-	psubw_r2r(mm1, mm3);				// tmp2-tmp5
-
-	psraw_i2r(3, mm5);				// outptr[0,2],[1,2],[2,2],[3,2]
-
-	movq_m2r(*(wsptr+3), mm4);		// tmp3,final4
-	psraw_i2r(3, mm3);				// outptr[0,5],[1,5],[2,5],[3,5]
-
-
-
-//    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
-//			    & RANGE_MASK];
-//    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
-//			    & RANGE_MASK];	final4
-	movq_r2r(mm4, mm6);
-	paddw_r2r(mm7, mm4);				// tmp3+tmp4
-
-	psubw_r2r(mm7, mm6);				// tmp3-tmp4
-	psraw_i2r(3, mm4);				// outptr[0,4],[1,4],[2,4],[3,4]
-
-	psraw_i2r(3, mm6);				// outptr[0,3],[1,3],[2,3],[3,3]
-
-	/*
-   movq_r2m(mm4, *dummy);
-	fprintf(stderr, "3-4 %016llx\n", dummy);
-   movq_r2m(mm4, *dummy);
-	fprintf(stderr, "3+4 %016llx\n", dummy);
-	*/
-	
-
-	packuswb_r2r(mm4, mm5);			// out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
-
-	packuswb_r2r(mm3, mm6);			// out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
-	movq_r2r(mm2, mm4);
-
-	movq_r2r(mm5, mm7);
-	punpcklbw_r2r(mm0, mm2);		// out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
-
-	punpckhbw_r2r(mm0, mm4);		// out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
-	movq_r2r(mm2, mm1);
-
-	punpcklbw_r2r(mm6, mm5);		// out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
-	
-	punpckhbw_r2r(mm6, mm7);		// out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
-
-	punpcklwd_r2r(mm5, mm2);		// out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
-	
-	movq_r2r(mm7, mm6);
-	punpckhwd_r2r(mm5, mm1);		// out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
-
-	movq_r2r(mm2, mm0);
-	punpcklwd_r2r(mm4, mm6);		// out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
-
-	punpckldq_r2r(mm6, mm2);		// out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
-
-	movq_r2r(mm1, mm3);
-
-	punpckhwd_r2r(mm4, mm7);		// out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
-	
-	dataptr += rskip;
-	movq_r2m(mm2, *(dataptr));
-
-	punpckhdq_r2r(mm6, mm0);		// out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
-
-	dataptr += rskip;
-	movq_r2m(mm0, *(dataptr));
-
-	punpckldq_r2r(mm7, mm1);		// out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
-	
-	punpckhdq_r2r(mm7, mm3);		// out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
-
-	dataptr += rskip;
-	movq_r2m(mm1, *(dataptr));
-
-	dataptr += rskip;
-	movq_r2m(mm3, *(dataptr));
-
-#else
-  __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  __s32 tmp10, tmp11, tmp12, tmp13;
-  __s32 z5, z10, z11, z12, z13;
-  __s16 *inptr;
-  __s32 *wsptr;
-  __u8 *outptr;
-  int ctr;
-  __s32 dcval;
-  __s32 workspace[64];
-
-  inptr = data;
-  wsptr = workspace;
-  for (ctr = 8; ctr > 0; ctr--) {
-    
-    if ((inptr[8] | inptr[16] | inptr[24] |
-	 inptr[32] | inptr[40] | inptr[48] | inptr[56]) == 0) {
-      dcval = inptr[0];
-      wsptr[0] = dcval;
-      wsptr[8] = dcval;
-      wsptr[16] = dcval;
-      wsptr[24] = dcval;
-      wsptr[32] = dcval;
-      wsptr[40] = dcval;
-      wsptr[48] = dcval;
-      wsptr[56] = dcval;
-      
-      inptr++;	
-      wsptr++;
-      continue;
-    } 
-    
-    tmp0 = inptr[0];
-    tmp1 = inptr[16];
-    tmp2 = inptr[32];
-    tmp3 = inptr[48];
-
-    tmp10 = tmp0 + tmp2;
-    tmp11 = tmp0 - tmp2;
-
-    tmp13 = tmp1 + tmp3;
-    tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13;
-
-    tmp0 = tmp10 + tmp13;
-    tmp3 = tmp10 - tmp13;
-    tmp1 = tmp11 + tmp12;
-    tmp2 = tmp11 - tmp12;
-    
-    tmp4 = inptr[8];
-    tmp5 = inptr[24];
-    tmp6 = inptr[40];
-    tmp7 = inptr[56];
-
-    z13 = tmp6 + tmp5;
-    z10 = tmp6 - tmp5;
-    z11 = tmp4 + tmp7;
-    z12 = tmp4 - tmp7;
-
-    tmp7 = z11 + z13;
-    tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
-
-    z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
-    tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
-    tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
-
-    tmp6 = tmp12 - tmp7;
-    tmp5 = tmp11 - tmp6;
-    tmp4 = tmp10 + tmp5;
-
-    wsptr[0] = (__s32) (tmp0 + tmp7);
-    wsptr[56] = (__s32) (tmp0 - tmp7);
-    wsptr[8] = (__s32) (tmp1 + tmp6);
-    wsptr[48] = (__s32) (tmp1 - tmp6);
-    wsptr[16] = (__s32) (tmp2 + tmp5);
-    wsptr[40] = (__s32) (tmp2 - tmp5);
-    wsptr[32] = (__s32) (tmp3 + tmp4);
-    wsptr[24] = (__s32) (tmp3 - tmp4);
-
-    inptr++;
-    wsptr++;
-  }
-
-  wsptr = workspace;
-  for (ctr = 0; ctr < 8; ctr++) {
-    outptr = &(odata[ctr*rskip]);
-
-    tmp10 = wsptr[0] + wsptr[4];
-    tmp11 = wsptr[0] - wsptr[4];
-
-    tmp13 = wsptr[2] + wsptr[6];
-    tmp12 = MULTIPLY(wsptr[2] - wsptr[6], FIX_1_414213562) - tmp13;
-
-    tmp0 = tmp10 + tmp13;
-    tmp3 = tmp10 - tmp13;
-    tmp1 = tmp11 + tmp12;
-    tmp2 = tmp11 - tmp12;
-
-    z13 = wsptr[5] + wsptr[3];
-    z10 = wsptr[5] - wsptr[3];
-    z11 = wsptr[1] + wsptr[7];
-    z12 = wsptr[1] - wsptr[7];
-
-    tmp7 = z11 + z13;
-    tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
-
-    z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
-    tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
-    tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
-
-    tmp6 = tmp12 - tmp7;
-    tmp5 = tmp11 - tmp6;
-    tmp4 = tmp10 + tmp5;
-
-    outptr[0] = RL(DESCALE(tmp0 + tmp7));
-    outptr[7] = RL(DESCALE(tmp0 - tmp7));
-    outptr[1] = RL(DESCALE(tmp1 + tmp6));
-    outptr[6] = RL(DESCALE(tmp1 - tmp6));
-    outptr[2] = RL(DESCALE(tmp2 + tmp5));
-    outptr[5] = RL(DESCALE(tmp2 - tmp5));
-    outptr[4] = RL(DESCALE(tmp3 + tmp4));
-    outptr[3] = RL(DESCALE(tmp3 - tmp4));
-
-    wsptr += 8;
-  }
-#endif
-}
 /*
 
 Main Routines
@@ -2754,7 +1466,6 @@
  RTjpeg_cb8--;
 
  RTjpeg_dct_init();
- RTjpeg_idct_init();
  RTjpeg_quant_init();
 }
 
@@ -2816,35 +1527,6 @@
   buf[64+i]=le2me_32(RTjpeg_ciqt[i]);
 }
 
-void RTjpeg_init_decompress(__u32 *buf, int width, int height)
-{
- int i;
-
- RTjpeg_init_data();
- 
- RTjpeg_width=width;
- RTjpeg_height=height;
- RTjpeg_Ywidth = RTjpeg_width>>3;
- RTjpeg_Ysize=width * height;
- RTjpeg_Cwidth = RTjpeg_width>>4;
- RTjpeg_Csize= (width>>1) * height;
-
- for(i=0; i<64; i++)
- {
-  RTjpeg_liqt[i]=le2me_32(buf[i]);
-  RTjpeg_ciqt[i]=le2me_32(buf[i+64]);
- }
-
- RTjpeg_lb8=0;
- while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8);
- RTjpeg_lb8--;
- RTjpeg_cb8=0;
- while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8);
- RTjpeg_cb8--;
-
- RTjpeg_idct_init();
-}
-
 int RTjpeg_compressYUV420(__s8 *sp, unsigned char *bp)
 {
  __s8 * sb;
@@ -2899,68 +1581,6 @@
  return (sp-sb);
 }
 
-void RTjpeg_decompressYUV420(__s8 *sp, __u8 *bp)
-{
- register __s8 * bp1 = bp + (RTjpeg_width<<3);
- register __s8 * bp2 = bp + RTjpeg_Ysize;
- register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1);
- int i, j,k;
-
-#if HAVE_MMX
- emms();
-#endif
-
-/* Y */
- for(i=RTjpeg_height>>1; i; i-=8)
- {
-  for(k=0, j=0; j<RTjpeg_width; j+=16, k+=8) {
-   if(*sp==-1)sp++;
-   else
-   { 
-    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
-    RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width);
-   }
-   if(*sp==-1)sp++;
-   else
-   { 
-    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
-    RTjpeg_idct(bp+j+8, RTjpeg_block, RTjpeg_width);
-   }
-   if(*sp==-1)sp++;
-   else
-   { 
-    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
-    RTjpeg_idct(bp1+j, RTjpeg_block, RTjpeg_width);
-   }
-   if(*sp==-1)sp++;
-   else
-   { 
-    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
-    RTjpeg_idct(bp1+j+8, RTjpeg_block, RTjpeg_width);
-   }
-   if(*sp==-1)sp++;
-   else
-   { 
-    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
-    RTjpeg_idct(bp2+k, RTjpeg_block, RTjpeg_width>>1);
-   } 
-   if(*sp==-1)sp++;
-   else
-   { 
-    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
-    RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1);
-   } 
-  }
-  bp+=RTjpeg_width<<4;
-  bp1+=RTjpeg_width<<4;
-  bp2+=RTjpeg_width<<2;
-  bp3+=RTjpeg_width<<2;
- }
-#if HAVE_MMX
- emms();
-#endif
-}
-
 /*
 External Function
 
--- a/libmpcodecs/native/rtjpegn.h	Sun Mar 08 13:21:00 2009 +0000
+++ b/libmpcodecs/native/rtjpegn.h	Sun Mar 08 13:32:42 2009 +0000
@@ -36,9 +36,7 @@
 #define __s64 int64_t
 
 void RTjpeg_init_compress(__u32 *buf, int width, int height, __u8 Q);
-void RTjpeg_init_decompress(__u32 *buf, int width, int height);
 int RTjpeg_compressYUV420(__s8 *sp, unsigned char *bp);
-void RTjpeg_decompressYUV420(__s8 *sp, __u8 *bp);
 
 void RTjpeg_init_mcompress(void);
 int RTjpeg_mcompressYUV420(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask);
--- a/libmpcodecs/vd.c	Sun Mar 08 13:21:00 2009 +0000
+++ b/libmpcodecs/vd.c	Sun Mar 08 13:32:42 2009 +0000
@@ -35,7 +35,6 @@
 extern vd_functions_t mpcodecs_vd_raw;
 extern vd_functions_t mpcodecs_vd_hmblck;
 extern vd_functions_t mpcodecs_vd_xanim;
-extern vd_functions_t mpcodecs_vd_nuv;
 extern vd_functions_t mpcodecs_vd_mpng;
 extern vd_functions_t mpcodecs_vd_ijpg;
 extern vd_functions_t mpcodecs_vd_mtga;
@@ -70,7 +69,6 @@
         &mpcodecs_vd_lzo,
         &mpcodecs_vd_raw,
         &mpcodecs_vd_hmblck,
-        &mpcodecs_vd_nuv,
 #ifdef CONFIG_XANIM
         &mpcodecs_vd_xanim,
 #endif
--- a/libmpcodecs/vd_nuv.c	Sun Mar 08 13:21:00 2009 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,54 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "config.h"
-#include "mp_msg.h"
-
-#include "vd_internal.h"
-
-static vd_info_t info = {
-	"NuppelVideo decoder",
-	"nuv",
-	"A'rpi",
-	"Alex & Panagiotis Issaris <takis@lumumba.luc.ac.be>",
-	"native codecs"
-};
-
-LIBVD_EXTERN(nuv)
-
-// to set/get/query special features/parameters
-static int control(sh_video_t *sh,int cmd,void* arg,...){
-    return CONTROL_UNKNOWN;
-}
-
-// init driver
-static int init(sh_video_t *sh){
-    return mpcodecs_config_vo(sh,sh->disp_w,sh->disp_h,IMGFMT_I420);
-}
-
-// uninit driver
-static void uninit(sh_video_t *sh){
-}
-
-//mp_image_t* mpcodecs_get_image(sh_video_t *sh, int mp_imgtype, int mp_imgflag, int w, int h);
-
-void decode_nuv(
-  unsigned char *encoded,
-  int encoded_size,
-  unsigned char *decoded,
-  int width,
-  int height);
-
-// decode a frame
-static mp_image_t* decode(sh_video_t *sh,void* data,int len,int flags){
-    mp_image_t* mpi;
-    if(len<=0) return NULL; // skipped frame
-    
-    mpi=mpcodecs_get_image(sh, MP_IMGTYPE_TEMP, 0, 
-	sh->disp_w, sh->disp_h);
-    if(!mpi) return NULL;
-
-    decode_nuv(data, len, mpi->planes[0], sh->disp_w, sh->disp_h);
-
-    return mpi;
-}