# HG changeset patch
# User alex
# Date 1009488449 0
# Node ID bf37d5cd3e6e49081ac25ff46de14828301b0885
# Parent 3cea69dda1b35b091240db68d348f6d1c043cb03
used by NuppelVideo decoder

diff -r 3cea69dda1b3 -r bf37d5cd3e6e RTjpegN.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RTjpegN.c	Thu Dec 27 21:27:29 2001 +0000
@@ -0,0 +1,3794 @@
+/*
+ RTjpeg (C) Justin Schoeman 1998 (justin@suntiger.ee.up.ac.za)
+
+ With modifications by:
+ (c) 1998, 1999 by Joerg Walter
+ and
+ (c) 1999 by Wim Taymans
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "RTjpegN.h"
+
+#ifdef MMX
+#include "mmx.h"
+#endif
+
+//#define SHOWBLOCK 1
+#define BETTERCOMPRESSION 1
+
+static const unsigned char RTjpeg_ZZ[64]={
+0,
+8, 1,
+2, 9, 16,
+24, 17, 10, 3,
+4, 11, 18, 25, 32,
+40, 33, 26, 19, 12, 5,
+6, 13, 20, 27, 34, 41, 48,
+56, 49, 42, 35, 28, 21, 14, 7,
+15, 22, 29, 36, 43, 50, 57,
+58, 51, 44, 37, 30, 23,
+31, 38, 45, 52, 59,
+60, 53, 46, 39,
+47, 54, 61,
+62, 55,
+63 };
+
+static const __u64 RTjpeg_aan_tab[64]={
+4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
+5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL,
+5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL,
+5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL,
+4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
+3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, 2651326208ULL, 1826357504ULL, 931136000ULL,
+2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, 1826357504ULL, 1258030336ULL, 641204288ULL,
+1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL, 931136000ULL, 641204288ULL, 326894240ULL,
+};
+
+#ifndef MMX
+static __s32 RTjpeg_ws[64+31];
+#endif
+__u8 RTjpeg_alldata[2*64+4*64+4*64+4*64+4*64+32];
+
+__s16 *block; // rh
+__s16 *RTjpeg_block;
+__s32 *RTjpeg_lqt;
+__s32 *RTjpeg_cqt;
+__u32 *RTjpeg_liqt;
+__u32 *RTjpeg_ciqt;
+
+unsigned char RTjpeg_lb8;
+unsigned char RTjpeg_cb8;
+int RTjpeg_width, RTjpeg_height;
+int RTjpeg_Ywidth, RTjpeg_Cwidth;
+int RTjpeg_Ysize, RTjpeg_Csize;
+
+__s16 *RTjpeg_old=NULL;
+
+#ifdef MMX
+mmx_t RTjpeg_lmask;
+mmx_t RTjpeg_cmask;
+#else
+__u16 RTjpeg_lmask;
+__u16 RTjpeg_cmask;
+#endif
+int RTjpeg_mtest=0;
+
+static const unsigned char RTjpeg_lum_quant_tbl[64] = {
+ 16, 11, 10, 16, 24, 40, 51, 61,
+ 12, 12, 14, 19, 26, 58, 60, 55,
+ 14, 13, 16, 24, 40, 57, 69, 56,
+ 14, 17, 22, 29, 51, 87, 80, 62,
+ 18, 22, 37, 56, 68, 109, 103, 77,
+ 24, 35, 55, 64, 81, 104, 113, 92,
+ 49, 64, 78, 87, 103, 121, 120, 101,
+ 72, 92, 95, 98, 112, 100, 103, 99
}; + +static const unsigned char RTjpeg_chrom_quant_tbl[64] = { + 17, 18, 24, 47, 99, 99, 99, 99, + 18, 21, 26, 66, 99, 99, 99, 99, + 24, 26, 56, 99, 99, 99, 99, 99, + 47, 66, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99 + }; + +#ifdef BETTERCOMPRESSION + +/*--------------------------------------------------*/ +/* better encoding, but needs a lot more cpu time */ +/* seems to be more effective than old method +lzo */ +/* with this encoding lzo isn't efficient anymore */ +/* there is still more potential for better */ +/* encoding but that would need even more cputime */ +/* anyway your mileage may vary */ +/* */ +/* written by Martin BIELY and Roman HOCHLEITNER */ +/*--------------------------------------------------*/ + +/* +++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* Block to Stream (encoding) */ +/* */ + +int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8) +{ + register int ci, co=1; + register __s16 ZZvalue; + register unsigned char bitten; + register unsigned char bitoff; + +#ifdef SHOWBLOCK + + int ii; + for (ii=0; ii < 64; ii++) { + fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]); + } + fprintf(stdout, "\n\n"); + +#endif + +// *strm++ = 0x10; +// *strm = 0x00; +// +// return 2; + + // first byte allways written + (__u8)strm[0]= + (__u8)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]); + + + ci=63; + while (data[RTjpeg_ZZ[ci]]==0 && ci>0) ci--; + + bitten = ((unsigned char)ci) << 2; + + if (ci==0) { + (__u8)strm[1]= bitten; + co = 2; + return (int)co; + } + + /* bitoff=0 because the high 6bit contain first non zero position */ + bitoff = 0; + co = 1; + + for(; ci>0; ci--) { + + ZZvalue = data[RTjpeg_ZZ[ci]]; + + switch(ZZvalue) { + case 0: + break; + case 1: + bitten |= (0x01<0; ci--) { + + ZZvalue = data[RTjpeg_ZZ[ci]]; + + if( (ZZvalue > 7) || (ZZvalue < -7) ) { + bitten |= (0x08<0; ci--) { + + ZZvalue = data[RTjpeg_ZZ[ci]]; + + if(ZZvalue>0) + { + strm[co++]=(__s8)(ZZvalue>127)?127:ZZvalue; + } + else + { + strm[co++]=(__s8)(ZZvalue<-128)?-128:ZZvalue; + } + + } + + +BAUCHWEH: + /* we gotoo much now we are ill */ +#ifdef SHOWBLOCK +{ +int i; +fprintf(stdout, "\nco = '%d'\n", co); + for (i=0; i < co+2; i++) { + fprintf(stdout, "%d ", strm[i]); + } +fprintf(stdout, "\n\n"); +} +#endif + + return (int)co; +} + +/* +++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* Stream to Block (decoding) */ +/* */ + +int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl) +{ + int ci; + register int co; + register int i; + register unsigned char bitten; + register unsigned char bitoff; + + /* first byte always read */ + i=RTjpeg_ZZ[0]; + data[i]=((__u8)strm[0])*qtbl[i]; + + /* we start at the behind */ + + bitten = ((unsigned char)strm[1]) >> 2; + co = 63; + for(; co > bitten; co--) { + + data[RTjpeg_ZZ[co]] = 0; + + } + + if (co==0) { + ci = 2; + goto AUTOBAHN; + } + + /* we have to read the last 2 bits of the second byte */ + ci=1; + bitoff = 0; + + for(; co>0; co--) { + + bitten = ((unsigned char)strm[ci]) >> bitoff; + bitten &= 0x03; + + i=RTjpeg_ZZ[co]; + + switch( bitten ) { + case 0x03: + data[i]= -qtbl[i]; + break; + case 0x02: + goto FUSSWEG; + break; + case 0x01: + data[i]= qtbl[i]; + break; + case 0x00: + data[i]= 0; + break; + default: + + } + + if( bitoff == 0 ) { + bitoff = 8; + ci++; + } + bitoff -= 2; + } + /* co is 0 now */ + /* data is written properly */ + + /* if bitoff!=6 then ci is the index, but should be the byte 
count, so we increment by 1 */ + if (bitoff!=6) ci++; + + goto AUTOBAHN; + + +FUSSWEG: +/* correct bitoff to nibble */ + switch(bitoff){ + case 4: + case 6: + bitoff = 0; + break; + case 2: + case 0: + /* we have to read from the next byte */ + ci++; + bitoff = 4; + break; + default: + break; + } + + for(; co>0; co--) { + + bitten = ((unsigned char)strm[ci]) >> bitoff; + bitten &= 0x0f; + + i=RTjpeg_ZZ[co]; + + if( bitten == 0x08 ) { + goto STRASSE; + } + + /* the compiler cannot do sign extension for signed nibbles */ + if( bitten & 0x08 ) { + bitten |= 0xf0; + } + /* the unsigned char bitten now is a valid signed char */ + + data[i]=((signed char)bitten)*qtbl[i]; + + if( bitoff == 0 ) { + bitoff = 8; + ci++; + } + bitoff -= 4; + } + /* co is 0 */ + + /* if bitoff!=4 then ci is the index, but should be the byte count, so we increment by 1 */ + if (bitoff!=4) ci++; + + goto AUTOBAHN; + +STRASSE: + ci++; + + for(; co>0; co--) { + i=RTjpeg_ZZ[co]; + data[i]=strm[ci++]*qtbl[i]; + } + + /* ci now is the count, because it points to next element => no incrementing */ + +AUTOBAHN: + +#ifdef SHOWBLOCK +fprintf(stdout, "\nci = '%d'\n", ci); + for (i=0; i < 64; i++) { + fprintf(stdout, "%d ", data[RTjpeg_ZZ[i]]); + } +fprintf(stdout, "\n\n"); +#endif + + return ci; +} + +#else + +int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8) +{ + register int ci, co=1, tmp; + register __s16 ZZvalue; + +#ifdef SHOWBLOCK + + int ii; + for (ii=0; ii < 64; ii++) { + fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]); + } + fprintf(stdout, "\n\n"); + +#endif + + (__u8)strm[0]=(__u8)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]); + + for(ci=1; ci<=bt8; ci++) + { + ZZvalue = data[RTjpeg_ZZ[ci]]; + + if(ZZvalue>0) + { + strm[co++]=(__s8)(ZZvalue>127)?127:ZZvalue; + } + else + { + strm[co++]=(__s8)(ZZvalue<-128)?-128:ZZvalue; + } + } + + for(; ci<64; ci++) + { + ZZvalue = data[RTjpeg_ZZ[ci]]; + + if(ZZvalue>0) + { + strm[co++]=(__s8)(ZZvalue>63)?63:ZZvalue; + } + else if(ZZvalue<0) + { + strm[co++]=(__s8)(ZZvalue<-64)?-64:ZZvalue; + } + else /* compress zeros */ + { + tmp=ci; + do + { + ci++; + } + while((ci<64)&&(data[RTjpeg_ZZ[ci]]==0)); + + strm[co++]=(__s8)(63+(ci-tmp)); + ci--; + } + } + return (int)co; +} + +int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl) +{ + int ci=1, co=1, tmp; + register int i; + + i=RTjpeg_ZZ[0]; + data[i]=((__u8)strm[0])*qtbl[i]; + + for(co=1; co<=bt8; co++) + { + i=RTjpeg_ZZ[co]; + data[i]=strm[ci++]*qtbl[i]; + } + + for(; co<64; co++) + { + if(strm[ci]>63) + { + tmp=co+strm[ci]-63; + for(; co>16); +} +#endif + +/* + * Perform the forward DCT on one block of samples. 
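+ *
+ * The per-coefficient scale factors of this DCT are kept in RTjpeg_aan_tab[]:
+ * RTjpeg_dct_init() divides the quantization tables by them and
+ * RTjpeg_idct_init() multiplies the dequantization tables by them, so the
+ * scaling is absorbed by (de)quantization instead of being applied as a
+ * separate pass around the transform.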
+ */ +#ifdef MMX +static mmx_t RTjpeg_C4 =(mmx_t)(long long)0x2D412D412D412D41LL; +static mmx_t RTjpeg_C6 =(mmx_t)(long long)0x187E187E187E187ELL; +static mmx_t RTjpeg_C2mC6=(mmx_t)(long long)0x22A322A322A322A3LL; +static mmx_t RTjpeg_C2pC6=(mmx_t)(long long)0x539F539F539F539FLL; +static mmx_t RTjpeg_zero =(mmx_t)(long long)0x0000000000000000LL; + +#else + +#define FIX_0_382683433 ((__s32) 98) /* FIX(0.382683433) */ +#define FIX_0_541196100 ((__s32) 139) /* FIX(0.541196100) */ +#define FIX_0_707106781 ((__s32) 181) /* FIX(0.707106781) */ +#define FIX_1_306562965 ((__s32) 334) /* FIX(1.306562965) */ + +#define DESCALE10(x) (__s16)( ((x)+128) >> 8) +#define DESCALE20(x) (__s16)(((x)+32768) >> 16) +#define D_MULTIPLY(var,const) ((__s32) ((var) * (const))) +#endif + +void RTjpeg_dct_init(void) +{ + int i; + + for(i=0; i<64; i++) + { + RTjpeg_lqt[i]=(((__u64)RTjpeg_lqt[i]<<32)/RTjpeg_aan_tab[i]); + RTjpeg_cqt[i]=(((__u64)RTjpeg_cqt[i]<<32)/RTjpeg_aan_tab[i]); + } +} + +void RTjpeg_dctY(__u8 *idata, __s16 *odata, int rskip) +{ +#ifndef MMX + __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __s32 tmp10, tmp11, tmp12, tmp13; + __s32 z1, z2, z3, z4, z5, z11, z13; + __u8 *idataptr; + __s16 *odataptr; + __s32 *wsptr; + int ctr; + + idataptr = idata; + wsptr = RTjpeg_ws; + for (ctr = 7; ctr >= 0; ctr--) { + tmp0 = idataptr[0] + idataptr[7]; + tmp7 = idataptr[0] - idataptr[7]; + tmp1 = idataptr[1] + idataptr[6]; + tmp6 = idataptr[1] - idataptr[6]; + tmp2 = idataptr[2] + idataptr[5]; + tmp5 = idataptr[2] - idataptr[5]; + tmp3 = idataptr[3] + idataptr[4]; + tmp4 = idataptr[3] - idataptr[4]; + + tmp10 = (tmp0 + tmp3); /* phase 2 */ + tmp13 = tmp0 - tmp3; + tmp11 = (tmp1 + tmp2); + tmp12 = tmp1 - tmp2; + + wsptr[0] = (tmp10 + tmp11)<<8; /* phase 3 */ + wsptr[4] = (tmp10 - tmp11)<<8; + + z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ + wsptr[2] = (tmp13<<8) + z1; /* phase 5 */ + wsptr[6] = (tmp13<<8) - z1; + + tmp10 = tmp4 + tmp5; /* phase 2 */ + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ + z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ + z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ + z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ + + z11 = (tmp7<<8) + z3; /* phase 5 */ + z13 = (tmp7<<8) - z3; + + wsptr[5] = z13 + z2; /* phase 6 */ + wsptr[3] = z13 - z2; + wsptr[1] = z11 + z4; + wsptr[7] = z11 - z4; + + idataptr += rskip<<3; /* advance pointer to next row */ + wsptr += 8; + } + + wsptr = RTjpeg_ws; + odataptr=odata; + for (ctr = 7; ctr >= 0; ctr--) { + tmp0 = wsptr[0] + wsptr[56]; + tmp7 = wsptr[0] - wsptr[56]; + tmp1 = wsptr[8] + wsptr[48]; + tmp6 = wsptr[8] - wsptr[48]; + tmp2 = wsptr[16] + wsptr[40]; + tmp5 = wsptr[16] - wsptr[40]; + tmp3 = wsptr[24] + wsptr[32]; + tmp4 = wsptr[24] - wsptr[32]; + + tmp10 = tmp0 + tmp3; /* phase 2 */ + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + odataptr[0] = DESCALE10(tmp10 + tmp11); /* phase 3 */ + odataptr[32] = DESCALE10(tmp10 - tmp11); + + z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ + odataptr[16] = DESCALE20((tmp13<<8) + z1); /* phase 5 */ + odataptr[48] = DESCALE20((tmp13<<8) - z1); + + tmp10 = tmp4 + tmp5; /* phase 2 */ + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ + z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ + z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ + z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ + + z11 = (tmp7<<8) + 
z3; /* phase 5 */ + z13 = (tmp7<<8) - z3; + + odataptr[40] = DESCALE20(z13 + z2); /* phase 6 */ + odataptr[24] = DESCALE20(z13 - z2); + odataptr[8] = DESCALE20(z11 + z4); + odataptr[56] = DESCALE20(z11 - z4); + + odataptr++; /* advance pointer to next column */ + wsptr++; + } +#else + volatile mmx_t tmp6, tmp7; + register mmx_t *dataptr = (mmx_t *)odata; + mmx_t *idata2 = (mmx_t *)idata; + + // first copy the input 8 bit to the destination 16 bits + + movq_m2r(RTjpeg_zero, mm2); + + + movq_m2r(*idata2, mm0); + movq_r2r(mm0, mm1); + + punpcklbw_r2r(mm2, mm0); + movq_r2m(mm0, *(dataptr)); + + punpckhbw_r2r(mm2, mm1); + movq_r2m(mm1, *(dataptr+1)); + + idata2 += rskip; + + movq_m2r(*idata2, mm0); + movq_r2r(mm0, mm1); + + punpcklbw_r2r(mm2, mm0); + movq_r2m(mm0, *(dataptr+2)); + + punpckhbw_r2r(mm2, mm1); + movq_r2m(mm1, *(dataptr+3)); + + idata2 += rskip; + + movq_m2r(*idata2, mm0); + movq_r2r(mm0, mm1); + + punpcklbw_r2r(mm2, mm0); + movq_r2m(mm0, *(dataptr+4)); + + punpckhbw_r2r(mm2, mm1); + movq_r2m(mm1, *(dataptr+5)); + + idata2 += rskip; + + movq_m2r(*idata2, mm0); + movq_r2r(mm0, mm1); + + punpcklbw_r2r(mm2, mm0); + movq_r2m(mm0, *(dataptr+6)); + + punpckhbw_r2r(mm2, mm1); + movq_r2m(mm1, *(dataptr+7)); + + idata2 += rskip; + + movq_m2r(*idata2, mm0); + movq_r2r(mm0, mm1); + + punpcklbw_r2r(mm2, mm0); + movq_r2m(mm0, *(dataptr+8)); + + punpckhbw_r2r(mm2, mm1); + movq_r2m(mm1, *(dataptr+9)); + + idata2 += rskip; + + movq_m2r(*idata2, mm0); + movq_r2r(mm0, mm1); + + punpcklbw_r2r(mm2, mm0); + movq_r2m(mm0, *(dataptr+10)); + + punpckhbw_r2r(mm2, mm1); + movq_r2m(mm1, *(dataptr+11)); + + idata2 += rskip; + + movq_m2r(*idata2, mm0); + movq_r2r(mm0, mm1); + + punpcklbw_r2r(mm2, mm0); + movq_r2m(mm0, *(dataptr+12)); + + punpckhbw_r2r(mm2, mm1); + movq_r2m(mm1, *(dataptr+13)); + + idata2 += rskip; + + movq_m2r(*idata2, mm0); + movq_r2r(mm0, mm1); + + punpcklbw_r2r(mm2, mm0); + movq_r2m(mm0, *(dataptr+14)); + + punpckhbw_r2r(mm2, mm1); + movq_r2m(mm1, *(dataptr+15)); + +/* Start Transpose to do calculations on rows */ + + movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into m5 + + movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2 + movq_r2r(mm7, mm5); + + punpcklwd_m2r(*(dataptr+11), mm7); // m11:m01|m10:m00 - interleave first and second lines + movq_r2r(mm6, mm2); + + punpcklwd_m2r(*(dataptr+15), mm6); // m31:m21|m30:m20 - interleave third and fourth lines + movq_r2r(mm7, mm1); + + movq_m2r(*(dataptr+11), mm3); // m13:m13|m11:m10 - second line + punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1 + + movq_m2r(*(dataptr+15), mm0); // m13:m13|m11:m10 - fourth line + punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2 + + movq_r2m(mm7,*(dataptr+9)); // write result 1 + punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines + + movq_r2m(mm1,*(dataptr+11)); // write result 2 + punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines + + movq_r2r(mm5, mm1); + punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3 + + movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4 + punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4 + + movq_r2m(mm5,*(dataptr+13)); // write result 3 + + // last 4x4 done + + movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4 + + movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line + movq_r2r(mm0, mm6); + + punpcklwd_m2r(*(dataptr+3), mm0); // 
m11:m01|m10:m00 - interleave first and second lines + movq_r2r(mm2, mm7); + + punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines + movq_r2r(mm0, mm4); + + // + movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line + punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result + + movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line + punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result + + punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines + movq_r2r(mm1, mm2); // copy first line + + punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines + movq_r2r(mm6, mm5); // copy first intermediate result + + movq_r2m(mm0, *(dataptr+8)); // write result 1 + punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result + + punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines + movq_r2r(mm3, mm0); // copy third line + + punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines + + movq_r2m(mm4, *(dataptr+10)); // write result 2 out + punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result + + punpcklwd_m2r(*(dataptr+14), mm3); // n31:n21|n30:n20 - interleave third and fourth lines + movq_r2r(mm1, mm4); + + movq_r2m(mm6, *(dataptr+12)); // write result 3 out + punpckldq_r2r(mm3, mm1); // n30:n20|n10:n00 - produce first result + + punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines + movq_r2r(mm2, mm6); + + movq_r2m(mm5, *(dataptr+14)); // write result 4 out + punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result + + movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block) + punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result + + movq_r2m(mm4, *(dataptr+3)); // write result 6 out + punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result + + movq_r2m(mm2, *(dataptr+5)); // write result 7 out + + movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4 + + movq_r2m(mm6, *(dataptr+7)); // write result 8 out + + +// Do first 4x4 quadrant, which is used in the beginning of the DCT: + + movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line + movq_r2r(mm0, mm2); + + punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines + movq_r2r(mm7, mm4); + + punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines + movq_r2r(mm0, mm1); + + movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line + punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1 + + movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line + punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2 + + movq_r2r(mm0, mm7); // write result 1 + punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines + + psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */ + movq_r2r(mm1, mm6); // write result 2 + + paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */ + punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines + + paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */ + movq_r2r(mm2, mm3); // copy first intermediate result + + psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */ + punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3 + + movq_r2m(mm7, tmp7); + 
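+ // all eight MMX registers are live at this point, so tmp07 (and tmp06 a few
+ // instructions below) is spilled to the volatile locals declared above and
+ // reloaded in stage 3 and in the odd part further down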
movq_r2r(mm2, mm5); // write result 3 + + movq_r2m(mm6, tmp6); + punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4 + + paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+5 /* Stage 1 */ + movq_r2r(mm3, mm4); // write result 4 + +/************************************************************************************************ + End of Transpose +************************************************************************************************/ + + + paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/ + movq_r2r(mm0, mm7); + + psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/ + movq_r2r(mm1, mm6); + + paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */ + psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */ + + psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */ + paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */ + + psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/ + paddw_r2r(mm7, mm6); // tmp12 + tmp13 + + /* stage 3 */ + + movq_m2r(tmp6, mm2); + movq_r2r(mm0, mm3); + + psllw_i2r(2, mm6); // m8 * 2^2 + paddw_r2r(mm1, mm0); + + pmulhw_m2r(RTjpeg_C4, mm6); // z1 + psubw_r2r(mm1, mm3); + + movq_r2m(mm0, *dataptr); + movq_r2r(mm7, mm0); + + /* Odd part */ + movq_r2m(mm3, *(dataptr+8)); + paddw_r2r(mm5, mm4); // tmp10 + + movq_m2r(tmp7, mm3); + paddw_r2r(mm6, mm0); // tmp32 + + paddw_r2r(mm2, mm5); // tmp11 + psubw_r2r(mm6, mm7); // tmp33 + + movq_r2m(mm0, *(dataptr+4)); + paddw_r2r(mm3, mm2); // tmp12 + + /* stage 4 */ + + movq_r2m(mm7, *(dataptr+12)); + movq_r2r(mm4, mm1); // copy of tmp10 + + psubw_r2r(mm2, mm1); // tmp10 - tmp12 + psllw_i2r(2, mm4); // m8 * 2^2 + + movq_m2r(RTjpeg_C2mC6, mm0); + psllw_i2r(2, mm1); + + pmulhw_m2r(RTjpeg_C6, mm1); // z5 + psllw_i2r(2, mm2); + + pmulhw_r2r(mm0, mm4); // z5 + + /* stage 5 */ + + pmulhw_m2r(RTjpeg_C2pC6, mm2); + psllw_i2r(2, mm5); + + pmulhw_m2r(RTjpeg_C4, mm5); // z3 + movq_r2r(mm3, mm0); // copy tmp7 + + movq_m2r(*(dataptr+1), mm7); + paddw_r2r(mm1, mm4); // z2 + + paddw_r2r(mm1, mm2); // z4 + + paddw_r2r(mm5, mm0); // z11 + psubw_r2r(mm5, mm3); // z13 + + /* stage 6 */ + + movq_r2r(mm3, mm5); // copy z13 + psubw_r2r(mm4, mm3); // y3=z13 - z2 + + paddw_r2r(mm4, mm5); // y5=z13 + z2 + movq_r2r(mm0, mm6); // copy z11 + + movq_r2m(mm3, *(dataptr+6)); //save y3 + psubw_r2r(mm2, mm0); // y7=z11 - z4 + + movq_r2m(mm5, *(dataptr+10)); //save y5 + paddw_r2r(mm2, mm6); // y1=z11 + z4 + + movq_r2m(mm0, *(dataptr+14)); //save y7 + + /************************************************ + * End of 1st 4 rows + ************************************************/ + + movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */ + movq_r2r(mm7, mm0); // copy x0 + + movq_r2m(mm6, *(dataptr+2)); //save y1 + + movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */ + movq_r2r(mm1, mm6); // copy x1 + + paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7 + + movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */ + movq_r2r(mm2, mm5); // copy x2 + + psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7 + movq_r2r(mm3, mm4); // copy x3 + + paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6 + + movq_r2m(mm7, tmp7); // save tmp07 + movq_r2r(mm0, mm7); // copy tmp00 + + psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6 + + /* stage 2, Even Part */ + + paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4 + + movq_r2m(mm6, tmp6); // save tmp07 + movq_r2r(mm1, mm6); // copy tmp01 + + paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5 + paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 + + psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 + 
+ psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4 + psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 + + paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 + + psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5 + paddw_r2r(mm7, mm6); // tmp12 + tmp13 + + /* stage 3, Even and stage 4 & 5 even */ + + movq_m2r(tmp6, mm2); // load tmp6 + movq_r2r(mm0, mm3); // copy tmp10 + + psllw_i2r(2, mm6); // shift z1 + paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11 + + pmulhw_m2r(RTjpeg_C4, mm6); // z1 + psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11 + + movq_r2m(mm0, *(dataptr+1)); //save y0 + movq_r2r(mm7, mm0); // copy tmp13 + + /* odd part */ + + movq_r2m(mm3, *(dataptr+9)); //save y4 + paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5 + + movq_m2r(tmp7, mm3); // load tmp7 + paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1 + + paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6 + psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1 + + movq_r2m(mm0, *(dataptr+5)); //save y2 + paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7 + + /* stage 4 */ + + movq_r2m(mm7, *(dataptr+13)); //save y6 + movq_r2r(mm4, mm1); // copy tmp10 + + psubw_r2r(mm2, mm1); // tmp10 - tmp12 + psllw_i2r(2, mm4); // shift tmp10 + + movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6 + psllw_i2r(2, mm1); // shift (tmp10-tmp12) + + pmulhw_m2r(RTjpeg_C6, mm1); // z5 + psllw_i2r(2, mm5); // prepare for multiply + + pmulhw_r2r(mm0, mm4); // multiply by converted real + + /* stage 5 */ + + pmulhw_m2r(RTjpeg_C4, mm5); // z3 + psllw_i2r(2, mm2); // prepare for multiply + + pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply + movq_r2r(mm3, mm0); // copy tmp7 + + movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7 + paddw_r2r(mm1, mm4); // z2 + + paddw_r2r(mm5, mm0); // z11 + psubw_r2r(mm5, mm3); // z13 + + /* stage 6 */ + + movq_r2r(mm3, mm5); // copy z13 + paddw_r2r(mm1, mm2); // z4 + + movq_r2r(mm0, mm6); // copy z11 + psubw_r2r(mm4, mm5); // y3 + + paddw_r2r(mm2, mm6); // y1 + paddw_r2r(mm4, mm3); // y5 + + movq_r2m(mm5, *(dataptr+7)); //save y3 + + movq_r2m(mm6, *(dataptr+3)); //save y1 + psubw_r2r(mm2, mm0); // y7 + +/************************************************************************************************ + Start of Transpose +************************************************************************************************/ + + movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2 + movq_r2r(mm7, mm5); // copy first line + + punpcklwd_r2r(mm3, mm7); // m11:m01|m10:m00 - interleave first and second lines + movq_r2r(mm6, mm2); // copy third line + + punpcklwd_r2r(mm0, mm6); // m31:m21|m30:m20 - interleave third and fourth lines + movq_r2r(mm7, mm1); // copy first intermediate result + + punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1 + + punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2 + + movq_r2m(mm7, *(dataptr+9)); // write result 1 + punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines + + movq_r2m(mm1, *(dataptr+11)); // write result 2 + punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines + + movq_r2r(mm5, mm1); // copy first intermediate result + punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3 + + movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4 + punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4 + + movq_r2m(mm5, *(dataptr+13)); // write result 3 + + /****** last 4x4 done */ + + movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4 + + 
movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line + movq_r2r(mm0, mm6); // copy first line + + punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines + movq_r2r(mm2, mm7); // copy third line + + punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines + movq_r2r(mm0, mm4); // copy first intermediate result + + + + movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line + punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result + + movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line + punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result + + punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines + movq_r2r(mm1, mm2); // copy first line + + punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines + movq_r2r(mm6, mm5); // copy first intermediate result + + movq_r2m(mm0, *(dataptr+8)); // write result 1 + punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result + + punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines + movq_r2r(mm3, mm0); // copy third line + + punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines + + movq_r2m(mm4, *(dataptr+10)); // write result 2 out + punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result + + punpcklwd_m2r(*(dataptr+14), mm3); // n33:n23|n32:n22 - interleave third and fourth lines + movq_r2r(mm1, mm4); // copy second intermediate result + + movq_r2m(mm6, *(dataptr+12)); // write result 3 out + punpckldq_r2r(mm3, mm1); // + + punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines + movq_r2r(mm2, mm6); // copy second intermediate result + + movq_r2m(mm5, *(dataptr+14)); // write result 4 out + punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result + + movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block) + punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result + + movq_r2m(mm4, *(dataptr+3)); // write result 6 out + punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result + + movq_r2m(mm2, *(dataptr+5)); // write result 7 out + + movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4 + + movq_r2m(mm6, *(dataptr+7)); // write result 8 out + +// Do first 4x4 quadrant, which is used in the beginning of the DCT: + + movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line + movq_r2r(mm0, mm2); // copy first line + + punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines + movq_r2r(mm7, mm4); // copy third line + + punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines + movq_r2r(mm0, mm1); // copy first intermediate result + + movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line + punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1 + + movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line + punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2 + + movq_r2r(mm0, mm7); // write result 1 + punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines + + psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */ + movq_r2r(mm1, mm6); // write result 2 + + paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */ + punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines + 
+ paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */ + movq_r2r(mm2, mm3); // copy first intermediate result + + psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */ + punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3 + + movq_r2m(mm7, tmp7); // save tmp07 + movq_r2r(mm2, mm5); // write result 3 + + movq_r2m(mm6, tmp6); // save tmp06 + + punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4 + + paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+x5 /* stage 1 */ + movq_r2r(mm3, mm4); // write result 4 + +/************************************************************************************************ + End of Transpose 2 +************************************************************************************************/ + + paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/ + movq_r2r(mm0, mm7); + + psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/ + movq_r2r(mm1, mm6); + + paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */ + psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */ + + psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */ + paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */ + + psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/ + paddw_r2r(mm7, mm6); // tmp12 + tmp13 + + /* stage 3 */ + + movq_m2r(tmp6, mm2); + movq_r2r(mm0, mm3); + + psllw_i2r(2, mm6); // m8 * 2^2 + paddw_r2r(mm1, mm0); + + pmulhw_m2r(RTjpeg_C4, mm6); // z1 + psubw_r2r(mm1, mm3); + + movq_r2m(mm0, *dataptr); + movq_r2r(mm7, mm0); + + /* Odd part */ + movq_r2m(mm3, *(dataptr+8)); + paddw_r2r(mm5, mm4); // tmp10 + + movq_m2r(tmp7, mm3); + paddw_r2r(mm6, mm0); // tmp32 + + paddw_r2r(mm2, mm5); // tmp11 + psubw_r2r(mm6, mm7); // tmp33 + + movq_r2m(mm0, *(dataptr+4)); + paddw_r2r(mm3, mm2); // tmp12 + + /* stage 4 */ + movq_r2m(mm7, *(dataptr+12)); + movq_r2r(mm4, mm1); // copy of tmp10 + + psubw_r2r(mm2, mm1); // tmp10 - tmp12 + psllw_i2r(2, mm4); // m8 * 2^2 + + movq_m2r(RTjpeg_C2mC6, mm0); + psllw_i2r(2, mm1); + + pmulhw_m2r(RTjpeg_C6, mm1); // z5 + psllw_i2r(2, mm2); + + pmulhw_r2r(mm0, mm4); // z5 + + /* stage 5 */ + + pmulhw_m2r(RTjpeg_C2pC6, mm2); + psllw_i2r(2, mm5); + + pmulhw_m2r(RTjpeg_C4, mm5); // z3 + movq_r2r(mm3, mm0); // copy tmp7 + + movq_m2r(*(dataptr+1), mm7); + paddw_r2r(mm1, mm4); // z2 + + paddw_r2r(mm1, mm2); // z4 + + paddw_r2r(mm5, mm0); // z11 + psubw_r2r(mm5, mm3); // z13 + + /* stage 6 */ + + movq_r2r(mm3, mm5); // copy z13 + psubw_r2r(mm4, mm3); // y3=z13 - z2 + + paddw_r2r(mm4, mm5); // y5=z13 + z2 + movq_r2r(mm0, mm6); // copy z11 + + movq_r2m(mm3, *(dataptr+6)); //save y3 + psubw_r2r(mm2, mm0); // y7=z11 - z4 + + movq_r2m(mm5, *(dataptr+10)); //save y5 + paddw_r2r(mm2, mm6); // y1=z11 + z4 + + movq_r2m(mm0, *(dataptr+14)); //save y7 + + /************************************************ + * End of 1st 4 rows + ************************************************/ + + movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */ + movq_r2r(mm7, mm0); // copy x0 + + movq_r2m(mm6, *(dataptr+2)); //save y1 + + movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */ + movq_r2r(mm1, mm6); // copy x1 + + paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7 + + movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */ + movq_r2r(mm2, mm5); // copy x2 + + psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7 + movq_r2r(mm3, mm4); // copy x3 + + paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6 + + movq_r2m(mm7, tmp7); // save tmp07 + movq_r2r(mm0, mm7); // copy tmp00 + + psubw_m2r(*(dataptr+13), mm6); // tmp06 = 
x1 - x6 + + /* stage 2, Even Part */ + + paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4 + + movq_r2m(mm6, tmp6); // save tmp07 + movq_r2r(mm1, mm6); // copy tmp01 + + paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5 + paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 + + psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 + + psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4 + psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 + + paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 + + psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5 + paddw_r2r(mm7, mm6); // tmp12 + tmp13 + + /* stage 3, Even and stage 4 & 5 even */ + + movq_m2r(tmp6, mm2); // load tmp6 + movq_r2r(mm0, mm3); // copy tmp10 + + psllw_i2r(2, mm6); // shift z1 + paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11 + + pmulhw_m2r(RTjpeg_C4, mm6); // z1 + psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11 + + movq_r2m(mm0, *(dataptr+1)); //save y0 + movq_r2r(mm7, mm0); // copy tmp13 + + /* odd part */ + + movq_r2m(mm3, *(dataptr+9)); //save y4 + paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5 + + movq_m2r(tmp7, mm3); // load tmp7 + paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1 + + paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6 + psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1 + + movq_r2m(mm0, *(dataptr+5)); //save y2 + paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7 + + /* stage 4 */ + + movq_r2m(mm7, *(dataptr+13)); //save y6 + movq_r2r(mm4, mm1); // copy tmp10 + + psubw_r2r(mm2, mm1); // tmp10 - tmp12 + psllw_i2r(2, mm4); // shift tmp10 + + movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6 + psllw_i2r(2, mm1); // shift (tmp10-tmp12) + + pmulhw_m2r(RTjpeg_C6, mm1); // z5 + psllw_i2r(2, mm5); // prepare for multiply + + pmulhw_r2r(mm0, mm4); // multiply by converted real + + /* stage 5 */ + + pmulhw_m2r(RTjpeg_C4, mm5); // z3 + psllw_i2r(2, mm2); // prepare for multiply + + pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply + movq_r2r(mm3, mm0); // copy tmp7 + + movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7 + paddw_r2r(mm1, mm4); // z2 + + paddw_r2r(mm5, mm0); // z11 + psubw_r2r(mm5, mm3); // z13 + + /* stage 6 */ + + movq_r2r(mm3, mm5); // copy z13 + paddw_r2r(mm1, mm2); // z4 + + movq_r2r(mm0, mm6); // copy z11 + psubw_r2r(mm4, mm5); // y3 + + paddw_r2r(mm2, mm6); // y1 + paddw_r2r(mm4, mm3); // y5 + + movq_r2m(mm5, *(dataptr+7)); //save y3 + psubw_r2r(mm2, mm0); // yè=z11 - z4 + + movq_r2m(mm3, *(dataptr+11)); //save y5 + + movq_r2m(mm6, *(dataptr+3)); //save y1 + + movq_r2m(mm0, *(dataptr+15)); //save y7 + + +#endif +} + +#define FIX_1_082392200 ((__s32) 277) /* FIX(1.082392200) */ +#define FIX_1_414213562 ((__s32) 362) /* FIX(1.414213562) */ +#define FIX_1_847759065 ((__s32) 473) /* FIX(1.847759065) */ +#define FIX_2_613125930 ((__s32) 669) /* FIX(2.613125930) */ + +#define DESCALE(x) (__s16)( ((x)+4) >> 3) + +/* clip yuv to 16..235 (should be 16..240 for cr/cb but ... */ + +#define RL(x) ((x)>235) ? 235 : (((x)<16) ? 
16 : (x)) +#define MULTIPLY(var,const) (((__s32) ((var) * (const)) + 128)>>8) + +void RTjpeg_idct_init(void) +{ + int i; + + for(i=0; i<64; i++) + { + RTjpeg_liqt[i]=((__u64)RTjpeg_liqt[i]*RTjpeg_aan_tab[i])>>32; + RTjpeg_ciqt[i]=((__u64)RTjpeg_ciqt[i]*RTjpeg_aan_tab[i])>>32; + } +} + +void RTjpeg_idct(__u8 *odata, __s16 *data, int rskip) +{ +#ifdef MMX + +static mmx_t fix_141 = (mmx_t)(long long)0x5a825a825a825a82LL; +static mmx_t fix_184n261 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; +static mmx_t fix_184 = (mmx_t)(long long)0x7641764176417641LL; +static mmx_t fix_n184 = (mmx_t)(long long)0x896f896f896f896fLL; +static mmx_t fix_108n184 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; + + mmx_t workspace[64]; + mmx_t *wsptr = workspace; + register mmx_t *dataptr = (mmx_t *)odata; + mmx_t *idata = (mmx_t *)data; + + rskip = rskip>>3; +/* + * Perform inverse DCT on one block of coefficients. + */ + + /* Odd part */ + + movq_m2r(*(idata+10), mm1); // load idata[DCTSIZE*5] + + movq_m2r(*(idata+6), mm0); // load idata[DCTSIZE*3] + + movq_m2r(*(idata+2), mm3); // load idata[DCTSIZE*1] + + movq_r2r(mm1, mm2); // copy tmp6 /* phase 6 */ + + movq_m2r(*(idata+14), mm4); // load idata[DCTSIZE*7] + + paddw_r2r(mm0, mm1); // z13 = tmp6 + tmp5; + + psubw_r2r(mm0, mm2); // z10 = tmp6 - tmp5 + + psllw_i2r(2, mm2); // shift z10 + movq_r2r(mm2, mm0); // copy z10 + + pmulhw_m2r(fix_184n261, mm2); // MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */ + movq_r2r(mm3, mm5); // copy tmp4 + + pmulhw_m2r(fix_n184, mm0); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */ + paddw_r2r(mm4, mm3); // z11 = tmp4 + tmp7; + + movq_r2r(mm3, mm6); // copy z11 /* phase 5 */ + psubw_r2r(mm4, mm5); // z12 = tmp4 - tmp7; + + psubw_r2r(mm1, mm6); // z11-z13 + psllw_i2r(2, mm5); // shift z12 + + movq_m2r(*(idata+12), mm4); // load idata[DCTSIZE*6], even part + movq_r2r(mm5, mm7); // copy z12 + + pmulhw_m2r(fix_108n184, mm5); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part + paddw_r2r(mm1, mm3); // tmp7 = z11 + z13; + + //ok + + /* Even part */ + pmulhw_m2r(fix_184, mm7); // MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */ + psllw_i2r(2, mm6); + + movq_m2r(*(idata+4), mm1); // load idata[DCTSIZE*2] + + paddw_r2r(mm5, mm0); // tmp10 + + paddw_r2r(mm7, mm2); // tmp12 + + pmulhw_m2r(fix_141, mm6); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ + psubw_r2r(mm3, mm2); // tmp6 = tmp12 - tmp7 + + movq_r2r(mm1, mm5); // copy tmp1 + paddw_r2r(mm4, mm1); // tmp13= tmp1 + tmp3; /* phases 5-3 */ + + psubw_r2r(mm4, mm5); // tmp1-tmp3 + psubw_r2r(mm2, mm6); // tmp5 = tmp11 - tmp6; + + movq_r2m(mm1, *(wsptr)); // save tmp13 in workspace + psllw_i2r(2, mm5); // shift tmp1-tmp3 + + movq_m2r(*(idata), mm7); // load idata[DCTSIZE*0] + + pmulhw_m2r(fix_141, mm5); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562) + paddw_r2r(mm6, mm0); // tmp4 = tmp10 + tmp5; + + movq_m2r(*(idata+8), mm4); // load idata[DCTSIZE*4] + + psubw_r2r(mm1, mm5); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */ + + movq_r2m(mm0, *(wsptr+4)); // save tmp4 in workspace + movq_r2r(mm7, mm1); // copy tmp0 /* phase 3 */ + + movq_r2m(mm5, *(wsptr+2)); // save tmp12 in workspace + psubw_r2r(mm4, mm1); // tmp11 = tmp0 - tmp2; + + paddw_r2r(mm4, mm7); // tmp10 = tmp0 + tmp2; + movq_r2r(mm1, mm5); // copy tmp11 + + paddw_m2r(*(wsptr+2), mm1); // tmp1 = tmp11 + tmp12; + movq_r2r(mm7, mm4); // copy tmp10 /* phase 2 */ + + paddw_m2r(*(wsptr), mm7); // tmp0 = tmp10 + tmp13; + + psubw_m2r(*(wsptr), mm4); // tmp3 = tmp10 - tmp13; + 
movq_r2r(mm7, mm0); // copy tmp0 + + psubw_m2r(*(wsptr+2), mm5); // tmp2 = tmp11 - tmp12; + paddw_r2r(mm3, mm7); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); + + psubw_r2r(mm3, mm0); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); + + movq_r2m(mm7, *(wsptr)); // wsptr[DCTSIZE*0] + movq_r2r(mm1, mm3); // copy tmp1 + + movq_r2m(mm0, *(wsptr+14)); // wsptr[DCTSIZE*7] + paddw_r2r(mm2, mm1); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); + + psubw_r2r(mm2, mm3); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); + + movq_r2m(mm1, *(wsptr+2)); // wsptr[DCTSIZE*1] + movq_r2r(mm4, mm1); // copy tmp3 + + movq_r2m(mm3, *(wsptr+12)); // wsptr[DCTSIZE*6] + + paddw_m2r(*(wsptr+4), mm4); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); + + psubw_m2r(*(wsptr+4), mm1); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); + + movq_r2m(mm4, *(wsptr+8)); + movq_r2r(mm5, mm7); // copy tmp2 + + paddw_r2r(mm6, mm5); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) + + movq_r2m(mm1, *(wsptr+6)); + psubw_r2r(mm6, mm7); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); + + movq_r2m(mm5, *(wsptr+4)); + + movq_r2m(mm7, *(wsptr+10)); + + //ok + + +/*****************************************************************/ + + idata++; + wsptr++; + +/*****************************************************************/ + + movq_m2r(*(idata+10), mm1); // load idata[DCTSIZE*5] + + movq_m2r(*(idata+6), mm0); // load idata[DCTSIZE*3] + + movq_m2r(*(idata+2), mm3); // load idata[DCTSIZE*1] + movq_r2r(mm1, mm2); // copy tmp6 /* phase 6 */ + + movq_m2r(*(idata+14), mm4); // load idata[DCTSIZE*7] + paddw_r2r(mm0, mm1); // z13 = tmp6 + tmp5; + + psubw_r2r(mm0, mm2); // z10 = tmp6 - tmp5 + + psllw_i2r(2, mm2); // shift z10 + movq_r2r(mm2, mm0); // copy z10 + + pmulhw_m2r(fix_184n261, mm2); // MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */ + movq_r2r(mm3, mm5); // copy tmp4 + + pmulhw_m2r(fix_n184, mm0); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */ + paddw_r2r(mm4, mm3); // z11 = tmp4 + tmp7; + + movq_r2r(mm3, mm6); // copy z11 /* phase 5 */ + psubw_r2r(mm4, mm5); // z12 = tmp4 - tmp7; + + psubw_r2r(mm1, mm6); // z11-z13 + psllw_i2r(2, mm5); // shift z12 + + movq_m2r(*(idata+12), mm4); // load idata[DCTSIZE*6], even part + movq_r2r(mm5, mm7); // copy z12 + + pmulhw_m2r(fix_108n184, mm5); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part + paddw_r2r(mm1, mm3); // tmp7 = z11 + z13; + + //ok + + /* Even part */ + pmulhw_m2r(fix_184, mm7); // MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */ + psllw_i2r(2, mm6); + + movq_m2r(*(idata+4), mm1); // load idata[DCTSIZE*2] + + paddw_r2r(mm5, mm0); // tmp10 + + paddw_r2r(mm7, mm2); // tmp12 + + pmulhw_m2r(fix_141, mm6); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ + psubw_r2r(mm3, mm2); // tmp6 = tmp12 - tmp7 + + movq_r2r(mm1, mm5); // copy tmp1 + paddw_r2r(mm4, mm1); // tmp13= tmp1 + tmp3; /* phases 5-3 */ + + psubw_r2r(mm4, mm5); // tmp1-tmp3 + psubw_r2r(mm2, mm6); // tmp5 = tmp11 - tmp6; + + movq_r2m(mm1, *(wsptr)); // save tmp13 in workspace + psllw_i2r(2, mm5); // shift tmp1-tmp3 + + movq_m2r(*(idata), mm7); // load idata[DCTSIZE*0] + paddw_r2r(mm6, mm0); // tmp4 = tmp10 + tmp5; + + pmulhw_m2r(fix_141, mm5); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562) + + movq_m2r(*(idata+8), mm4); // load idata[DCTSIZE*4] + + psubw_r2r(mm1, mm5); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */ + + movq_r2m(mm0, *(wsptr+4)); // save tmp4 in workspace + movq_r2r(mm7, mm1); // copy tmp0 /* phase 3 */ + + movq_r2m(mm5, *(wsptr+2)); // save tmp12 in workspace + psubw_r2r(mm4, mm1); // 
tmp11 = tmp0 - tmp2; + + paddw_r2r(mm4, mm7); // tmp10 = tmp0 + tmp2; + movq_r2r(mm1, mm5); // copy tmp11 + + paddw_m2r(*(wsptr+2), mm1); // tmp1 = tmp11 + tmp12; + movq_r2r(mm7, mm4); // copy tmp10 /* phase 2 */ + + paddw_m2r(*(wsptr), mm7); // tmp0 = tmp10 + tmp13; + + psubw_m2r(*(wsptr), mm4); // tmp3 = tmp10 - tmp13; + movq_r2r(mm7, mm0); // copy tmp0 + + psubw_m2r(*(wsptr+2), mm5); // tmp2 = tmp11 - tmp12; + paddw_r2r(mm3, mm7); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); + + psubw_r2r(mm3, mm0); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); + + movq_r2m(mm7, *(wsptr)); // wsptr[DCTSIZE*0] + movq_r2r(mm1, mm3); // copy tmp1 + + movq_r2m(mm0, *(wsptr+14)); // wsptr[DCTSIZE*7] + paddw_r2r(mm2, mm1); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); + + psubw_r2r(mm2, mm3); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); + + movq_r2m(mm1, *(wsptr+2)); // wsptr[DCTSIZE*1] + movq_r2r(mm4, mm1); // copy tmp3 + + movq_r2m(mm3, *(wsptr+12)); // wsptr[DCTSIZE*6] + + paddw_m2r(*(wsptr+4), mm4); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); + + psubw_m2r(*(wsptr+4), mm1); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); + + movq_r2m(mm4, *(wsptr+8)); + movq_r2r(mm5, mm7); // copy tmp2 + + paddw_r2r(mm6, mm5); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) + + movq_r2m(mm1, *(wsptr+6)); + psubw_r2r(mm6, mm7); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); + + movq_r2m(mm5, *(wsptr+4)); + + movq_r2m(mm7, *(wsptr+10)); + +/*****************************************************************/ + + /* Pass 2: process rows from work array, store into output array. */ + /* Note that we must descale the results by a factor of 8 == 2**3, */ + /* and also undo the PASS1_BITS scaling. */ + +/*****************************************************************/ + /* Even part */ + + wsptr--; + +// tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); +// tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); +// tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); +// tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]); + movq_m2r(*(wsptr), mm0); // wsptr[0,0],[0,1],[0,2],[0,3] + + movq_m2r(*(wsptr+1), mm1); // wsptr[0,4],[0,5],[0,6],[0,7] + movq_r2r(mm0, mm2); + + movq_m2r(*(wsptr+2), mm3); // wsptr[1,0],[1,1],[1,2],[1,3] + paddw_r2r(mm1, mm0); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] + + movq_m2r(*(wsptr+3), mm4); // wsptr[1,4],[1,5],[1,6],[1,7] + psubw_r2r(mm1, mm2); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] + + movq_r2r(mm0, mm6); + movq_r2r(mm3, mm5); + + paddw_r2r(mm4, mm3); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] + movq_r2r(mm2, mm1); + + psubw_r2r(mm4, mm5); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] + punpcklwd_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] + + movq_m2r(*(wsptr+7), mm7); // wsptr[3,4],[3,5],[3,6],[3,7] + punpckhwd_r2r(mm3, mm6); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] + + movq_m2r(*(wsptr+4), mm3); // wsptr[2,0],[2,1],[2,2],[2,3] + punpckldq_r2r(mm6, mm0); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] + + punpcklwd_r2r(mm5, mm1); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] + movq_r2r(mm3, mm4); + + movq_m2r(*(wsptr+6), mm6); // wsptr[3,0],[3,1],[3,2],[3,3] + punpckhwd_r2r(mm5, mm2); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] + + movq_m2r(*(wsptr+5), mm5); // wsptr[2,4],[2,5],[2,6],[2,7] + punpckldq_r2r(mm2, mm1); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] + + + paddw_r2r(mm5, mm3); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] + movq_r2r(mm6, mm2); + + psubw_r2r(mm5, mm4); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] + paddw_r2r(mm7, mm6); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] + + movq_r2r(mm3, mm5); + punpcklwd_r2r(mm6, mm3); // 
wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] + + psubw_r2r(mm7, mm2); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] + punpckhwd_r2r(mm6, mm5); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] + + movq_r2r(mm4, mm7); + punpckldq_r2r(mm5, mm3); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] + + punpcklwd_r2r(mm2, mm4); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] + + punpckhwd_r2r(mm2, mm7); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] + + punpckldq_r2r(mm7, mm4); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] + movq_r2r(mm1, mm6); + + //ok + +// mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] +// mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] + + + movq_r2r(mm0, mm2); + punpckhdq_r2r(mm4, mm6); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] + + punpckldq_r2r(mm4, mm1); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] + psllw_i2r(2, mm6); + + pmulhw_m2r(fix_141, mm6); + punpckldq_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] + + punpckhdq_r2r(mm3, mm2); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] + movq_r2r(mm0, mm7); + +// tmp0 = tmp10 + tmp13; +// tmp3 = tmp10 - tmp13; + paddw_r2r(mm2, mm0); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] + psubw_r2r(mm2, mm7); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] + +// tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13; + psubw_r2r(mm2, mm6); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] +// tmp1 = tmp11 + tmp12; +// tmp2 = tmp11 - tmp12; + movq_r2r(mm1, mm5); + + //OK + + /* Odd part */ + +// z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; +// z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; +// z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; +// z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; + movq_m2r(*(wsptr), mm3); // wsptr[0,0],[0,1],[0,2],[0,3] + paddw_r2r(mm6, mm1); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] + + movq_m2r(*(wsptr+1), mm4); // wsptr[0,4],[0,5],[0,6],[0,7] + psubw_r2r(mm6, mm5); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] + + movq_r2r(mm3, mm6); + punpckldq_r2r(mm4, mm3); // wsptr[0,0],[0,1],[0,4],[0,5] + + punpckhdq_r2r(mm6, mm4); // wsptr[0,6],[0,7],[0,2],[0,3] + movq_r2r(mm3, mm2); + +//Save tmp0 and tmp1 in wsptr + movq_r2m(mm0, *(wsptr)); // save tmp0 + paddw_r2r(mm4, mm2); // wsptr[xxx],[0,z11],[xxx],[0,z13] + + +//Continue with z10 --- z13 + movq_m2r(*(wsptr+2), mm6); // wsptr[1,0],[1,1],[1,2],[1,3] + psubw_r2r(mm4, mm3); // wsptr[xxx],[0,z12],[xxx],[0,z10] + + movq_m2r(*(wsptr+3), mm0); // wsptr[1,4],[1,5],[1,6],[1,7] + movq_r2r(mm6, mm4); + + movq_r2m(mm1, *(wsptr+1)); // save tmp1 + punpckldq_r2r(mm0, mm6); // wsptr[1,0],[1,1],[1,4],[1,5] + + punpckhdq_r2r(mm4, mm0); // wsptr[1,6],[1,7],[1,2],[1,3] + movq_r2r(mm6, mm1); + +//Save tmp2 and tmp3 in wsptr + paddw_r2r(mm0, mm6); // wsptr[xxx],[1,z11],[xxx],[1,z13] + movq_r2r(mm2, mm4); + +//Continue with z10 --- z13 + movq_r2m(mm5, *(wsptr+2)); // save tmp2 + punpcklwd_r2r(mm6, mm2); // wsptr[xxx],[xxx],[0,z11],[1,z11] + + psubw_r2r(mm0, mm1); // wsptr[xxx],[1,z12],[xxx],[1,z10] + punpckhwd_r2r(mm6, mm4); // wsptr[xxx],[xxx],[0,z13],[1,z13] + + movq_r2r(mm3, mm0); + punpcklwd_r2r(mm1, mm3); // wsptr[xxx],[xxx],[0,z12],[1,z12] + + movq_r2m(mm7, *(wsptr+3)); // save tmp3 + punpckhwd_r2r(mm1, mm0); // wsptr[xxx],[xxx],[0,z10],[1,z10] + + movq_m2r(*(wsptr+4), mm6); // wsptr[2,0],[2,1],[2,2],[2,3] + punpckhdq_r2r(mm2, mm0); // wsptr[0,z10],[1,z10],[0,z11],[1,z11] + + movq_m2r(*(wsptr+5), mm7); // wsptr[2,4],[2,5],[2,6],[2,7] + punpckhdq_r2r(mm4, mm3); // wsptr[0,z12],[1,z12],[0,z13],[1,z13] + + movq_m2r(*(wsptr+6), mm1); // wsptr[3,0],[3,1],[3,2],[3,3] + movq_r2r(mm6, mm4); + + punpckldq_r2r(mm7, mm6); 
// wsptr[2,0],[2,1],[2,4],[2,5] + movq_r2r(mm1, mm5); + + punpckhdq_r2r(mm4, mm7); // wsptr[2,6],[2,7],[2,2],[2,3] + movq_r2r(mm6, mm2); + + movq_m2r(*(wsptr+7), mm4); // wsptr[3,4],[3,5],[3,6],[3,7] + paddw_r2r(mm7, mm6); // wsptr[xxx],[2,z11],[xxx],[2,z13] + + psubw_r2r(mm7, mm2); // wsptr[xxx],[2,z12],[xxx],[2,z10] + punpckldq_r2r(mm4, mm1); // wsptr[3,0],[3,1],[3,4],[3,5] + + punpckhdq_r2r(mm5, mm4); // wsptr[3,6],[3,7],[3,2],[3,3] + movq_r2r(mm1, mm7); + + paddw_r2r(mm4, mm1); // wsptr[xxx],[3,z11],[xxx],[3,z13] + psubw_r2r(mm4, mm7); // wsptr[xxx],[3,z12],[xxx],[3,z10] + + movq_r2r(mm6, mm5); + punpcklwd_r2r(mm1, mm6); // wsptr[xxx],[xxx],[2,z11],[3,z11] + + punpckhwd_r2r(mm1, mm5); // wsptr[xxx],[xxx],[2,z13],[3,z13] + movq_r2r(mm2, mm4); + + punpcklwd_r2r(mm7, mm2); // wsptr[xxx],[xxx],[2,z12],[3,z12] + + punpckhwd_r2r(mm7, mm4); // wsptr[xxx],[xxx],[2,z10],[3,z10] + + punpckhdq_r2r(mm6, mm4); /// wsptr[2,z10],[3,z10],[2,z11],[3,z11] + + punpckhdq_r2r(mm5, mm2); // wsptr[2,z12],[3,z12],[2,z13],[3,z13] + movq_r2r(mm0, mm5); + + punpckldq_r2r(mm4, mm0); // wsptr[0,z10],[1,z10],[2,z10],[3,z10] + + punpckhdq_r2r(mm4, mm5); // wsptr[0,z11],[1,z11],[2,z11],[3,z11] + movq_r2r(mm3, mm4); + + punpckhdq_r2r(mm2, mm4); // wsptr[0,z13],[1,z13],[2,z13],[3,z13] + movq_r2r(mm5, mm1); + + punpckldq_r2r(mm2, mm3); // wsptr[0,z12],[1,z12],[2,z12],[3,z12] +// tmp7 = z11 + z13; /* phase 5 */ +// tmp8 = z11 - z13; /* phase 5 */ + psubw_r2r(mm4, mm1); // tmp8 + + paddw_r2r(mm4, mm5); // tmp7 +// tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */ + psllw_i2r(2, mm1); + + psllw_i2r(2, mm0); + + pmulhw_m2r(fix_141, mm1); // tmp21 +// tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */ +// + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */ + psllw_i2r(2, mm3); + movq_r2r(mm0, mm7); + + pmulhw_m2r(fix_n184, mm7); + movq_r2r(mm3, mm6); + + movq_m2r(*(wsptr), mm2); // tmp0,final1 + + pmulhw_m2r(fix_108n184, mm6); +// tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */ +// + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */ + movq_r2r(mm2, mm4); // final1 + + pmulhw_m2r(fix_184n261, mm0); + paddw_r2r(mm5, mm2); // tmp0+tmp7,final1 + + pmulhw_m2r(fix_184, mm3); + psubw_r2r(mm5, mm4); // tmp0-tmp7,final1 + +// tmp6 = tmp22 - tmp7; /* phase 2 */ + psraw_i2r(3, mm2); // outptr[0,0],[1,0],[2,0],[3,0],final1 + + paddw_r2r(mm6, mm7); // tmp20 + psraw_i2r(3, mm4); // outptr[0,7],[1,7],[2,7],[3,7],final1 + + paddw_r2r(mm0, mm3); // tmp22 + +// tmp5 = tmp21 - tmp6; + psubw_r2r(mm5, mm3); // tmp6 + +// tmp4 = tmp20 + tmp5; + movq_m2r(*(wsptr+1), mm0); // tmp1,final2 + psubw_r2r(mm3, mm1); // tmp5 + + movq_r2r(mm0, mm6); // final2 + paddw_r2r(mm3, mm0); // tmp1+tmp6,final2 + + /* Final output stage: scale down by a factor of 8 and range-limit */ + + +// outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) +// & RANGE_MASK]; +// outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) +// & RANGE_MASK]; final1 + + +// outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) +// & RANGE_MASK]; +// outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) +// & RANGE_MASK]; final2 + psubw_r2r(mm3, mm6); // tmp1-tmp6,final2 + psraw_i2r(3, mm0); // outptr[0,1],[1,1],[2,1],[3,1] + + psraw_i2r(3, mm6); // outptr[0,6],[1,6],[2,6],[3,6] + + packuswb_r2r(mm4, mm0); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] + + movq_m2r(*(wsptr+2), mm5); // tmp2,final3 + packuswb_r2r(mm6, mm2); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] + +// outptr[2] = 
range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) +// & RANGE_MASK]; +// outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) +// & RANGE_MASK]; final3 + paddw_r2r(mm1, mm7); // tmp4 + movq_r2r(mm5, mm3); + + paddw_r2r(mm1, mm5); // tmp2+tmp5 + psubw_r2r(mm1, mm3); // tmp2-tmp5 + + psraw_i2r(3, mm5); // outptr[0,2],[1,2],[2,2],[3,2] + + movq_m2r(*(wsptr+3), mm4); // tmp3,final4 + psraw_i2r(3, mm3); // outptr[0,5],[1,5],[2,5],[3,5] + + + +// outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) +// & RANGE_MASK]; +// outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) +// & RANGE_MASK]; final4 + movq_r2r(mm4, mm6); + paddw_r2r(mm7, mm4); // tmp3+tmp4 + + psubw_r2r(mm7, mm6); // tmp3-tmp4 + psraw_i2r(3, mm4); // outptr[0,4],[1,4],[2,4],[3,4] + + // mov ecx, [dataptr] + + psraw_i2r(3, mm6); // outptr[0,3],[1,3],[2,3],[3,3] + + packuswb_r2r(mm4, mm5); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] + + packuswb_r2r(mm3, mm6); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] + movq_r2r(mm2, mm4); + + movq_r2r(mm5, mm7); + punpcklbw_r2r(mm0, mm2); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] + + punpckhbw_r2r(mm0, mm4); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] + movq_r2r(mm2, mm1); + + punpcklbw_r2r(mm6, mm5); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] + + // add dataptr, 4 + + punpckhbw_r2r(mm6, mm7); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] + + punpcklwd_r2r(mm5, mm2); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] + + // add ecx, output_col + + movq_r2r(mm7, mm6); + punpckhwd_r2r(mm5, mm1); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] + + movq_r2r(mm2, mm0); + punpcklwd_r2r(mm4, mm6); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] + + // mov idata, [dataptr] + + punpckldq_r2r(mm6, mm2); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] + + // add dataptr, 4 + + movq_r2r(mm1, mm3); + + // add idata, output_col + + punpckhwd_r2r(mm4, mm7); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] + + movq_r2m(mm2, *(dataptr)); + + punpckhdq_r2r(mm6, mm0); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] + + dataptr += rskip; + movq_r2m(mm0, *(dataptr)); + + punpckldq_r2r(mm7, mm1); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] + punpckhdq_r2r(mm7, mm3); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] + + dataptr += rskip; + movq_r2m(mm1, *(dataptr)); + + dataptr += rskip; + movq_r2m(mm3, *(dataptr)); + +/*******************************************************************/ + + wsptr += 8; + +/*******************************************************************/ + +// tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); +// tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); +// tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); +// tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]); + movq_m2r(*(wsptr), mm0); // wsptr[0,0],[0,1],[0,2],[0,3] + + movq_m2r(*(wsptr+1), mm1); // wsptr[0,4],[0,5],[0,6],[0,7] + movq_r2r(mm0, mm2); + + movq_m2r(*(wsptr+2), mm3); // wsptr[1,0],[1,1],[1,2],[1,3] + paddw_r2r(mm1, mm0); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] + + movq_m2r(*(wsptr+3), mm4); // wsptr[1,4],[1,5],[1,6],[1,7] + psubw_r2r(mm1, mm2); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] + + movq_r2r(mm0, mm6); + movq_r2r(mm3, mm5); + + paddw_r2r(mm4, mm3); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] + movq_r2r(mm2, mm1); + + psubw_r2r(mm4, mm5); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] + punpcklwd_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] + + movq_m2r(*(wsptr+7), mm7); // 
wsptr[3,4],[3,5],[3,6],[3,7] + punpckhwd_r2r(mm3, mm6); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] + + movq_m2r(*(wsptr+4), mm3); // wsptr[2,0],[2,1],[2,2],[2,3] + punpckldq_r2r(mm6, mm0); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] + + punpcklwd_r2r(mm5, mm1); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] + movq_r2r(mm3, mm4); + + movq_m2r(*(wsptr+6), mm6); // wsptr[3,0],[3,1],[3,2],[3,3] + punpckhwd_r2r(mm5, mm2); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] + + movq_m2r(*(wsptr+5), mm5); // wsptr[2,4],[2,5],[2,6],[2,7] + punpckldq_r2r(mm2, mm1); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] + + paddw_r2r(mm5, mm3); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] + movq_r2r(mm6, mm2); + + psubw_r2r(mm5, mm4); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] + paddw_r2r(mm7, mm6); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] + + movq_r2r(mm3, mm5); + punpcklwd_r2r(mm6, mm3); // wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] + + psubw_r2r(mm7, mm2); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] + punpckhwd_r2r(mm6, mm5); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] + + movq_r2r(mm4, mm7); + punpckldq_r2r(mm5, mm3); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] + + punpcklwd_r2r(mm2, mm4); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] + + punpckhwd_r2r(mm2, mm7); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] + + punpckldq_r2r(mm7, mm4); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] + movq_r2r(mm1, mm6); + + //OK + +// mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] +// mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] + + movq_r2r(mm0, mm2); + punpckhdq_r2r(mm4, mm6); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] + + punpckldq_r2r(mm4, mm1); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] + psllw_i2r(2, mm6); + + pmulhw_m2r(fix_141, mm6); + punpckldq_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] + + punpckhdq_r2r(mm3, mm2); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] + movq_r2r(mm0, mm7); + +// tmp0 = tmp10 + tmp13; +// tmp3 = tmp10 - tmp13; + paddw_r2r(mm2, mm0); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] + psubw_r2r(mm2, mm7); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] + +// tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13; + psubw_r2r(mm2, mm6); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] +// tmp1 = tmp11 + tmp12; +// tmp2 = tmp11 - tmp12; + movq_r2r(mm1, mm5); + + //OK + + + /* Odd part */ + +// z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; +// z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; +// z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; +// z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; + movq_m2r(*(wsptr), mm3); // wsptr[0,0],[0,1],[0,2],[0,3] + paddw_r2r(mm6, mm1); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] + + movq_m2r(*(wsptr+1), mm4); // wsptr[0,4],[0,5],[0,6],[0,7] + psubw_r2r(mm6, mm5); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] + + movq_r2r(mm3, mm6); + punpckldq_r2r(mm4, mm3); // wsptr[0,0],[0,1],[0,4],[0,5] + + punpckhdq_r2r(mm6, mm4); // wsptr[0,6],[0,7],[0,2],[0,3] + movq_r2r(mm3, mm2); + +//Save tmp0 and tmp1 in wsptr + movq_r2m(mm0, *(wsptr)); // save tmp0 + paddw_r2r(mm4, mm2); // wsptr[xxx],[0,z11],[xxx],[0,z13] + + +//Continue with z10 --- z13 + movq_m2r(*(wsptr+2), mm6); // wsptr[1,0],[1,1],[1,2],[1,3] + psubw_r2r(mm4, mm3); // wsptr[xxx],[0,z12],[xxx],[0,z10] + + movq_m2r(*(wsptr+3), mm0); // wsptr[1,4],[1,5],[1,6],[1,7] + movq_r2r(mm6, mm4); + + movq_r2m(mm1, *(wsptr+1)); // save tmp1 + punpckldq_r2r(mm0, mm6); // wsptr[1,0],[1,1],[1,4],[1,5] + + punpckhdq_r2r(mm4, mm0); // wsptr[1,6],[1,7],[1,2],[1,3] + movq_r2r(mm6, mm1); + +//Save tmp2 and tmp3 in wsptr + paddw_r2r(mm0, mm6); // 
wsptr[xxx],[1,z11],[xxx],[1,z13] + movq_r2r(mm2, mm4); + +//Continue with z10 --- z13 + movq_r2m(mm5, *(wsptr+2)); // save tmp2 + punpcklwd_r2r(mm6, mm2); // wsptr[xxx],[xxx],[0,z11],[1,z11] + + psubw_r2r(mm0, mm1); // wsptr[xxx],[1,z12],[xxx],[1,z10] + punpckhwd_r2r(mm6, mm4); // wsptr[xxx],[xxx],[0,z13],[1,z13] + + movq_r2r(mm3, mm0); + punpcklwd_r2r(mm1, mm3); // wsptr[xxx],[xxx],[0,z12],[1,z12] + + movq_r2m(mm7, *(wsptr+3)); // save tmp3 + punpckhwd_r2r(mm1, mm0); // wsptr[xxx],[xxx],[0,z10],[1,z10] + + movq_m2r(*(wsptr+4), mm6); // wsptr[2,0],[2,1],[2,2],[2,3] + punpckhdq_r2r(mm2, mm0); // wsptr[0,z10],[1,z10],[0,z11],[1,z11] + + movq_m2r(*(wsptr+5), mm7); // wsptr[2,4],[2,5],[2,6],[2,7] + punpckhdq_r2r(mm4, mm3); // wsptr[0,z12],[1,z12],[0,z13],[1,z13] + + movq_m2r(*(wsptr+6), mm1); // wsptr[3,0],[3,1],[3,2],[3,3] + movq_r2r(mm6, mm4); + + punpckldq_r2r(mm7, mm6); // wsptr[2,0],[2,1],[2,4],[2,5] + movq_r2r(mm1, mm5); + + punpckhdq_r2r(mm4, mm7); // wsptr[2,6],[2,7],[2,2],[2,3] + movq_r2r(mm6, mm2); + + movq_m2r(*(wsptr+7), mm4); // wsptr[3,4],[3,5],[3,6],[3,7] + paddw_r2r(mm7, mm6); // wsptr[xxx],[2,z11],[xxx],[2,z13] + + psubw_r2r(mm7, mm2); // wsptr[xxx],[2,z12],[xxx],[2,z10] + punpckldq_r2r(mm4, mm1); // wsptr[3,0],[3,1],[3,4],[3,5] + + punpckhdq_r2r(mm5, mm4); // wsptr[3,6],[3,7],[3,2],[3,3] + movq_r2r(mm1, mm7); + + paddw_r2r(mm4, mm1); // wsptr[xxx],[3,z11],[xxx],[3,z13] + psubw_r2r(mm4, mm7); // wsptr[xxx],[3,z12],[xxx],[3,z10] + + movq_r2r(mm6, mm5); + punpcklwd_r2r(mm1, mm6); // wsptr[xxx],[xxx],[2,z11],[3,z11] + + punpckhwd_r2r(mm1, mm5); // wsptr[xxx],[xxx],[2,z13],[3,z13] + movq_r2r(mm2, mm4); + + punpcklwd_r2r(mm7, mm2); // wsptr[xxx],[xxx],[2,z12],[3,z12] + + punpckhwd_r2r(mm7, mm4); // wsptr[xxx],[xxx],[2,z10],[3,z10] + + punpckhdq_r2r(mm6, mm4); // wsptr[2,z10],[3,z10],[2,z11],[3,z11] + + punpckhdq_r2r(mm5, mm2); // wsptr[2,z12],[3,z12],[2,z13],[3,z13] + movq_r2r(mm0, mm5); + + punpckldq_r2r(mm4, mm0); // wsptr[0,z10],[1,z10],[2,z10],[3,z10] + + punpckhdq_r2r(mm4, mm5); // wsptr[0,z11],[1,z11],[2,z11],[3,z11] + movq_r2r(mm3, mm4); + + punpckhdq_r2r(mm2, mm4); // wsptr[0,z13],[1,z13],[2,z13],[3,z13] + movq_r2r(mm5, mm1); + + punpckldq_r2r(mm2, mm3); // wsptr[0,z12],[1,z12],[2,z12],[3,z12] +// tmp7 = z11 + z13; /* phase 5 */ +// tmp8 = z11 - z13; /* phase 5 */ + psubw_r2r(mm4, mm1); // tmp8 + + paddw_r2r(mm4, mm5); // tmp7 +// tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */ + psllw_i2r(2, mm1); + + psllw_i2r(2, mm0); + + pmulhw_m2r(fix_141, mm1); // tmp21 +// tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */ +// + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */ + psllw_i2r(2, mm3); + movq_r2r(mm0, mm7); + + pmulhw_m2r(fix_n184, mm7); + movq_r2r(mm3, mm6); + + movq_m2r(*(wsptr), mm2); // tmp0,final1 + + pmulhw_m2r(fix_108n184, mm6); +// tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */ +// + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */ + movq_r2r(mm2, mm4); // final1 + + pmulhw_m2r(fix_184n261, mm0); + paddw_r2r(mm5, mm2); // tmp0+tmp7,final1 + + pmulhw_m2r(fix_184, mm3); + psubw_r2r(mm5, mm4); // tmp0-tmp7,final1 + +// tmp6 = tmp22 - tmp7; /* phase 2 */ + psraw_i2r(3, mm2); // outptr[0,0],[1,0],[2,0],[3,0],final1 + + paddw_r2r(mm6, mm7); // tmp20 + psraw_i2r(3, mm4); // outptr[0,7],[1,7],[2,7],[3,7],final1 + + paddw_r2r(mm0, mm3); // tmp22 + +// tmp5 = tmp21 - tmp6; + psubw_r2r(mm5, mm3); // tmp6 + +// tmp4 = tmp20 + tmp5; + movq_m2r(*(wsptr+1), mm0); // tmp1,final2 + psubw_r2r(mm3, mm1); // tmp5 + + movq_r2r(mm0, mm6); // 
final2 + paddw_r2r(mm3, mm0); // tmp1+tmp6,final2 + + /* Final output stage: scale down by a factor of 8 and range-limit */ + +// outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) +// & RANGE_MASK]; +// outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) +// & RANGE_MASK]; final1 + + +// outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) +// & RANGE_MASK]; +// outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) +// & RANGE_MASK]; final2 + psubw_r2r(mm3, mm6); // tmp1-tmp6,final2 + psraw_i2r(3, mm0); // outptr[0,1],[1,1],[2,1],[3,1] + + psraw_i2r(3, mm6); // outptr[0,6],[1,6],[2,6],[3,6] + + packuswb_r2r(mm4, mm0); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] + + movq_m2r(*(wsptr+2), mm5); // tmp2,final3 + packuswb_r2r(mm6, mm2); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] + +// outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) +// & RANGE_MASK]; +// outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) +// & RANGE_MASK]; final3 + paddw_r2r(mm1, mm7); // tmp4 + movq_r2r(mm5, mm3); + + paddw_r2r(mm1, mm5); // tmp2+tmp5 + psubw_r2r(mm1, mm3); // tmp2-tmp5 + + psraw_i2r(3, mm5); // outptr[0,2],[1,2],[2,2],[3,2] + + movq_m2r(*(wsptr+3), mm4); // tmp3,final4 + psraw_i2r(3, mm3); // outptr[0,5],[1,5],[2,5],[3,5] + + + +// outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) +// & RANGE_MASK]; +// outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) +// & RANGE_MASK]; final4 + movq_r2r(mm4, mm6); + paddw_r2r(mm7, mm4); // tmp3+tmp4 + + psubw_r2r(mm7, mm6); // tmp3-tmp4 + psraw_i2r(3, mm4); // outptr[0,4],[1,4],[2,4],[3,4] + + psraw_i2r(3, mm6); // outptr[0,3],[1,3],[2,3],[3,3] + + /* + movq_r2m(mm4, *dummy); + fprintf(stderr, "3-4 %016llx\n", dummy); + movq_r2m(mm4, *dummy); + fprintf(stderr, "3+4 %016llx\n", dummy); + */ + + + packuswb_r2r(mm4, mm5); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] + + packuswb_r2r(mm3, mm6); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] + movq_r2r(mm2, mm4); + + movq_r2r(mm5, mm7); + punpcklbw_r2r(mm0, mm2); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] + + punpckhbw_r2r(mm0, mm4); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] + movq_r2r(mm2, mm1); + + punpcklbw_r2r(mm6, mm5); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] + + punpckhbw_r2r(mm6, mm7); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] + + punpcklwd_r2r(mm5, mm2); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] + + movq_r2r(mm7, mm6); + punpckhwd_r2r(mm5, mm1); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] + + movq_r2r(mm2, mm0); + punpcklwd_r2r(mm4, mm6); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] + + punpckldq_r2r(mm6, mm2); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] + + movq_r2r(mm1, mm3); + + punpckhwd_r2r(mm4, mm7); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] + + dataptr += rskip; + movq_r2m(mm2, *(dataptr)); + + punpckhdq_r2r(mm6, mm0); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] + + dataptr += rskip; + movq_r2m(mm0, *(dataptr)); + + punpckldq_r2r(mm7, mm1); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] + + punpckhdq_r2r(mm7, mm3); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] + + dataptr += rskip; + movq_r2m(mm1, *(dataptr)); + + dataptr += rskip; + movq_r2m(mm3, *(dataptr)); + +#else + __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __s32 tmp10, tmp11, tmp12, tmp13; + __s32 z5, z10, z11, z12, z13; + __s16 *inptr; + __s32 *wsptr; + __u8 *outptr; + int ctr; + __s32 dcval; + __s32 
workspace[64]; + + inptr = data; + wsptr = workspace; + for (ctr = 8; ctr > 0; ctr--) { + + if ((inptr[8] | inptr[16] | inptr[24] | + inptr[32] | inptr[40] | inptr[48] | inptr[56]) == 0) { + dcval = inptr[0]; + wsptr[0] = dcval; + wsptr[8] = dcval; + wsptr[16] = dcval; + wsptr[24] = dcval; + wsptr[32] = dcval; + wsptr[40] = dcval; + wsptr[48] = dcval; + wsptr[56] = dcval; + + inptr++; + wsptr++; + continue; + } + + tmp0 = inptr[0]; + tmp1 = inptr[16]; + tmp2 = inptr[32]; + tmp3 = inptr[48]; + + tmp10 = tmp0 + tmp2; + tmp11 = tmp0 - tmp2; + + tmp13 = tmp1 + tmp3; + tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; + + tmp0 = tmp10 + tmp13; + tmp3 = tmp10 - tmp13; + tmp1 = tmp11 + tmp12; + tmp2 = tmp11 - tmp12; + + tmp4 = inptr[8]; + tmp5 = inptr[24]; + tmp6 = inptr[40]; + tmp7 = inptr[56]; + + z13 = tmp6 + tmp5; + z10 = tmp6 - tmp5; + z11 = tmp4 + tmp7; + z12 = tmp4 - tmp7; + + tmp7 = z11 + z13; + tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); + + z5 = MULTIPLY(z10 + z12, FIX_1_847759065); + tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; + tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; + + tmp6 = tmp12 - tmp7; + tmp5 = tmp11 - tmp6; + tmp4 = tmp10 + tmp5; + + wsptr[0] = (__s32) (tmp0 + tmp7); + wsptr[56] = (__s32) (tmp0 - tmp7); + wsptr[8] = (__s32) (tmp1 + tmp6); + wsptr[48] = (__s32) (tmp1 - tmp6); + wsptr[16] = (__s32) (tmp2 + tmp5); + wsptr[40] = (__s32) (tmp2 - tmp5); + wsptr[32] = (__s32) (tmp3 + tmp4); + wsptr[24] = (__s32) (tmp3 - tmp4); + + inptr++; + wsptr++; + } + + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++) { + outptr = &(odata[ctr*rskip]); + + tmp10 = wsptr[0] + wsptr[4]; + tmp11 = wsptr[0] - wsptr[4]; + + tmp13 = wsptr[2] + wsptr[6]; + tmp12 = MULTIPLY(wsptr[2] - wsptr[6], FIX_1_414213562) - tmp13; + + tmp0 = tmp10 + tmp13; + tmp3 = tmp10 - tmp13; + tmp1 = tmp11 + tmp12; + tmp2 = tmp11 - tmp12; + + z13 = wsptr[5] + wsptr[3]; + z10 = wsptr[5] - wsptr[3]; + z11 = wsptr[1] + wsptr[7]; + z12 = wsptr[1] - wsptr[7]; + + tmp7 = z11 + z13; + tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); + + z5 = MULTIPLY(z10 + z12, FIX_1_847759065); + tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; + tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; + + tmp6 = tmp12 - tmp7; + tmp5 = tmp11 - tmp6; + tmp4 = tmp10 + tmp5; + + outptr[0] = RL(DESCALE(tmp0 + tmp7)); + outptr[7] = RL(DESCALE(tmp0 - tmp7)); + outptr[1] = RL(DESCALE(tmp1 + tmp6)); + outptr[6] = RL(DESCALE(tmp1 - tmp6)); + outptr[2] = RL(DESCALE(tmp2 + tmp5)); + outptr[5] = RL(DESCALE(tmp2 - tmp5)); + outptr[4] = RL(DESCALE(tmp3 + tmp4)); + outptr[3] = RL(DESCALE(tmp3 - tmp4)); + + wsptr += 8; + } +#endif +} +/* + +Main Routines + +This file contains most of the initialisation and control functions + +(C) Justin Schoeman 1998 + +*/ + +/* + +Private function + +Initialise all the cache-aliged data blocks + +*/ + +void RTjpeg_init_data(void) +{ + unsigned long dptr; + + dptr=(unsigned long)&(RTjpeg_alldata[0]); + dptr+=32; + dptr=dptr>>5; + dptr=dptr<<5; /* cache align data */ + + RTjpeg_block=(__s16 *)dptr; + dptr+=sizeof(__s16)*64; + RTjpeg_lqt=(__s32 *)dptr; + dptr+=sizeof(__s32)*64; + RTjpeg_cqt=(__s32 *)dptr; + dptr+=sizeof(__s32)*64; + RTjpeg_liqt=(__u32 *)dptr; + dptr+=sizeof(__u32)*64; + RTjpeg_ciqt=(__u32 *)dptr; +} + +/* + +External Function + +Re-set quality factor + +Input: buf -> pointer to 128 ints for quant values store to pass back to + init_decompress. 
+ Q -> quality factor (192=best, 32=worst) +*/ + +void RTjpeg_init_Q(__u8 Q) +{ + int i; + __u64 qual; + + qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */ + + for(i=0; i<64; i++) + { + RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3); + if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1; + RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3); + if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1; + RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3); + RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3); + RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3; + RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3; + } + + RTjpeg_lb8=0; + while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8); + RTjpeg_lb8--; + RTjpeg_cb8=0; + while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8); + RTjpeg_cb8--; + + RTjpeg_dct_init(); + RTjpeg_idct_init(); + RTjpeg_quant_init(); +} + +/* + +External Function + +Initialise compression. + +Input: buf -> pointer to 128 ints for quant values store to pass back to + init_decompress. + width -> width of image + height -> height of image + Q -> quality factor (192=best, 32=worst) + +*/ + +void RTjpeg_init_compress(__u32 *buf, int width, int height, __u8 Q) +{ + int i; + __u64 qual; + + RTjpeg_init_data(); + + RTjpeg_width=width; + RTjpeg_height=height; + RTjpeg_Ywidth = RTjpeg_width>>3; + RTjpeg_Ysize=width * height; + RTjpeg_Cwidth = RTjpeg_width>>4; + RTjpeg_Csize= (width>>1) * height; + + qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */ + + for(i=0; i<64; i++) + { + RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3); + if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1; + RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3); + if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1; + RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3); + RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3); + RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3; + RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3; + } + + RTjpeg_lb8=0; + while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8); + RTjpeg_lb8--; + RTjpeg_cb8=0; + while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8); + RTjpeg_cb8--; + + RTjpeg_dct_init(); + RTjpeg_quant_init(); + + for(i=0; i<64; i++) + buf[i]=RTjpeg_liqt[i]; + for(i=0; i<64; i++) + buf[64+i]=RTjpeg_ciqt[i]; +} + +void RTjpeg_init_decompress(__u32 *buf, int width, int height) +{ + int i; + + RTjpeg_init_data(); + + RTjpeg_width=width; + RTjpeg_height=height; + RTjpeg_Ywidth = RTjpeg_width>>3; + RTjpeg_Ysize=width * height; + RTjpeg_Cwidth = RTjpeg_width>>4; + RTjpeg_Csize= (width>>1) * height; + + for(i=0; i<64; i++) + { + RTjpeg_liqt[i]=buf[i]; + RTjpeg_ciqt[i]=buf[i+64]; + } + + RTjpeg_lb8=0; + while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8); + RTjpeg_lb8--; + RTjpeg_cb8=0; + while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8); + RTjpeg_cb8--; + + RTjpeg_idct_init(); + +// RTjpeg_color_init(); +} + +int RTjpeg_compressYUV420(__s8 *sp, unsigned char *bp) +{ + __s8 * sb; + register __s8 * bp1 = bp + (RTjpeg_width<<3); + register __s8 * bp2 = bp + RTjpeg_Ysize; + register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1); + register int i, j, k; + +#ifdef MMX + emms(); +#endif + sb=sp; +/* Y */ + for(i=RTjpeg_height>>1; i; i-=8) + { + for(j=0, k=0; j>1); + } + if(*sp==-1)sp++; + else + { + sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt); + RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1); + } + } + bp+=RTjpeg_width<<3; + bp2+=RTjpeg_width<<2; + bp3+=RTjpeg_width<<2; + } +#ifdef MMX + emms(); +#endif +} + +void RTjpeg_decompressYUV420(__s8 *sp, __u8 *bp) +{ + register __s8 * bp1 = bp + (RTjpeg_width<<3); + register __s8 * bp2 = bp + RTjpeg_Ysize; + 
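/* Editorial note, not part of the original patch: the pointers being set up
   here assume the planar 4:2:0 frame layout used throughout this file -- a
   full-resolution luma plane of RTjpeg_Ysize (= width*height) bytes followed
   by two quarter-resolution chroma planes of RTjpeg_Csize/2
   (= (width/2)*(height/2)) bytes each; bp1 sits 8 luma lines below bp because
   every pass of the outer loop handles a 16-line strip as two rows of 8x8
   blocks. */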
register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1); + int i, j,k; + +#ifdef MMX + emms(); +#endif + +/* Y */ + for(i=RTjpeg_height>>1; i; i-=8) + { + for(k=0, j=0; j>1); + } + if(*sp==-1)sp++; + else + { + sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt); + RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1); + } + } + bp+=RTjpeg_width<<4; + bp1+=RTjpeg_width<<4; + bp2+=RTjpeg_width<<2; + bp3+=RTjpeg_width<<2; + } +#ifdef MMX + emms(); +#endif +} + +void RTjpeg_decompress8(__s8 *sp, __u8 *bp) +{ + int i, j; + +#ifdef MMX + emms(); +#endif + +/* Y */ + for(i=0; i>5; + RTjpeg_old=(__s16 *)(tmp<<5); + } + if (!RTjpeg_old) + { + fprintf(stderr, "RTjpeg: Could not allocate memory\n"); + exit(-1); + } + bzero(RTjpeg_old, ((4*RTjpeg_width*RTjpeg_height))); +} + +#ifdef MMX + +int RTjpeg_bcomp(__s16 *old, mmx_t *mask) +{ + int i; + mmx_t *mold=(mmx_t *)old; + mmx_t *mblock=(mmx_t *)RTjpeg_block; + volatile mmx_t result; + static mmx_t neg=(mmx_t)(unsigned long long)0xffffffffffffffffULL; + + movq_m2r(*mask, mm7); + movq_m2r(neg, mm6); + pxor_r2r(mm5, mm5); + + for(i=0; i<8; i++) + { + movq_m2r(*(mblock++), mm0); + movq_m2r(*(mblock++), mm2); + movq_m2r(*(mold++), mm1); + movq_m2r(*(mold++), mm3); + psubsw_r2r(mm1, mm0); + psubsw_r2r(mm3, mm2); + movq_r2r(mm0, mm1); + movq_r2r(mm2, mm3); + pcmpgtw_r2r(mm7, mm0); + pcmpgtw_r2r(mm7, mm2); + pxor_r2r(mm6, mm1); + pxor_r2r(mm6, mm3); + pcmpgtw_r2r(mm7, mm1); + pcmpgtw_r2r(mm7, mm3); + por_r2r(mm0, mm5); + por_r2r(mm2, mm5); + por_r2r(mm1, mm5); + por_r2r(mm3, mm5); + } + movq_r2m(mm5, result); + + if(result.q) + { +// if(!RTjpeg_mtest) +// for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i]; + return 0; + } +// printf("."); + return 1; +} + +#else +int RTjpeg_bcomp(__s16 *old, __u16 *mask) +{ + int i; + + for(i=0; i<64; i++) + if(abs(old[i]-RTjpeg_block[i])>*mask) + { + if(!RTjpeg_mtest) + for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i]; + return 0; + } + return 1; +} +#endif + +void RTjpeg_set_test(int i) +{ + RTjpeg_mtest=i; +} + +int RTjpeg_mcompressYUV420(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask) +{ + __s8 * sb; +//rh __s16 *block; + register __s8 * bp1 = bp + (RTjpeg_width<<3); + register __s8 * bp2 = bp + RTjpeg_Ysize; + register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1); + register int i, j, k; + +#ifdef MMX + emms(); + RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask); + RTjpeg_cmask=(mmx_t)(((__u64)cmask<<48)|((__u64)cmask<<32)|((__u64)cmask<<16)|cmask); +#else + RTjpeg_lmask=lmask; + RTjpeg_cmask=cmask; +#endif + + sb=sp; + block=RTjpeg_old; +/* Y */ + for(i=RTjpeg_height>>1; i; i-=8) + { + for(j=0, k=0; j>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y-crG-cbG)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y+cbB)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + + y=(bufy[j+1]-16)*Ky; + + tmp=(y+crR)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y-crG-cbG)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y+cbB)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + + } + bufy+=yskip; + } +} + + +void RTjpeg_yuv420rgb(__u8 *buf, __u8 *rgb, int stride) +{ + int tmp; + int i, j; + __s32 y, crR, crG, cbG, cbB; + __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; + int oskip, yskip; + + if(stride==0) + oskip=RTjpeg_width*3; + else + oskip=2*stride-RTjpeg_width*3; + + yskip=RTjpeg_width; + + bufcb=&buf[RTjpeg_width*RTjpeg_height]; + bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4]; + bufy=&buf[0]; + bufoute=rgb; + 
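/* Editorial note, not part of the original patch: with 4:2:0 input one pair
   of chroma samples covers a 2x2 block of luma pixels, so the converter
   produces two RGB rows per pass of the inner loop -- bufoute walks the even
   output row and bufouto (initialised just below) the odd one, and both are
   advanced by oskip after each row pair so that a caller-supplied stride is
   honoured. */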
bufouto=rgb+RTjpeg_width*3; + + for(i=0; i<(RTjpeg_height>>1); i++) + { + for(j=0; j>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y-crG-cbG)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y+cbB)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + + y=(bufy[j+1]-16)*Ky; + + tmp=(y+crR)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y-crG-cbG)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y+cbB)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + + y=(bufy[j+yskip]-16)*Ky; + + tmp=(y+crR)>>16; + *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y-crG-cbG)>>16; + *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y+cbB)>>16; + *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); + + y=(bufy[j+1+yskip]-16)*Ky; + + tmp=(y+crR)>>16; + *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y-crG-cbG)>>16; + *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y+cbB)>>16; + *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); + + } + bufoute+=oskip; + bufouto+=oskip; + bufy+=yskip<<1; + } +} + + +void RTjpeg_yuvrgb32(__u8 *buf, __u8 *rgb, int stride) +{ + int tmp; + int i, j; + __s32 y, crR, crG, cbG, cbB; + __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; + int oskip, yskip; + + if(stride==0) + oskip=RTjpeg_width*4; + else + oskip = 2*stride-RTjpeg_width*4; + yskip=RTjpeg_width; + + bufcb=&buf[RTjpeg_width*RTjpeg_height]; + bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/2]; + bufy=&buf[0]; + bufoute=rgb; + bufouto=rgb+RTjpeg_width*4; + + for(i=0; i<(RTjpeg_height>>1); i++) + { + for(j=0; j>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y-crG-cbG)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y+crR)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + bufoute++; + + y=(bufy[j+1]-16)*Ky; + + tmp=(y+cbB)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y-crG-cbG)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y+crR)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + bufoute++; + + y=(bufy[j+yskip]-16)*Ky; + + tmp=(y+cbB)>>16; + *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y-crG-cbG)>>16; + *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y+crR)>>16; + *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); + bufouto++; + + y=(bufy[j+1+yskip]-16)*Ky; + + tmp=(y+cbB)>>16; + *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y-crG-cbG)>>16; + *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y+crR)>>16; + *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); + bufouto++; + + } + bufoute+=oskip; + bufouto+=oskip; + bufy+=yskip<<1; + } +} + +void RTjpeg_yuvrgb24(__u8 *buf, __u8 *rgb, int stride) +{ + int tmp; + int i, j; + __s32 y, crR, crG, cbG, cbB; + __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; + int oskip, yskip; + + if(stride==0) + oskip=RTjpeg_width*3; + else + oskip=2*stride - RTjpeg_width*3; + + yskip=RTjpeg_width; + + bufcb=&buf[RTjpeg_width*RTjpeg_height]; + bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4]; + bufy=&buf[0]; + bufoute=rgb; + bufouto=rgb+RTjpeg_width*3; + + for(i=0; i<(RTjpeg_height>>1); i++) + { + for(j=0; j>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y-crG-cbG)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y+crR)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + + y=(bufy[j+1]-16)*Ky; + + tmp=(y+cbB)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y-crG-cbG)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y+crR)>>16; + *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); + + y=(bufy[j+yskip]-16)*Ky; + + tmp=(y+cbB)>>16; + 
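/* Editorial note, not part of the original patch: the arithmetic in these
   converters appears to be 16.16 fixed point -- y and the chroma
   contributions crR, crG, cbG and cbB carry a 2^16 scale factor, each channel
   is formed by adding or subtracting the relevant terms from y and shifting
   right by 16, and the ternary expressions below saturate the result to the
   0..255 byte range; in the 24- and 32-bit writers the bytes are emitted in
   B, G, R order. */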
*(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y-crG-cbG)>>16; + *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y+crR)>>16; + *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); + + y=(bufy[j+1+yskip]-16)*Ky; + + tmp=(y+cbB)>>16; + *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y-crG-cbG)>>16; + *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y+crR)>>16; + *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); + + } + bufoute+=oskip; + bufouto+=oskip; + bufy+=yskip<<1; + } +} + +void RTjpeg_yuvrgb16(__u8 *buf, __u8 *rgb, int stride) +{ + int tmp; + int i, j; + __s32 y, crR, crG, cbG, cbB; + __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; + int oskip, yskip; + unsigned char r, g, b; + + if(stride==0) + oskip=RTjpeg_width*2; + else + oskip=2*stride-RTjpeg_width*2; + + yskip=RTjpeg_width; + + bufcb=&buf[RTjpeg_width*RTjpeg_height]; + bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4]; + bufy=&buf[0]; + bufoute=rgb; + bufouto=rgb+RTjpeg_width*2; + + for(i=0; i<(RTjpeg_height>>1); i++) + { + for(j=0; j>16; + b=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y-crG-cbG)>>16; + g=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y+crR)>>16; + r=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(int)((int)b >> 3); + tmp|=(int)(((int)g >> 2) << 5); + tmp|=(int)(((int)r >> 3) << 11); + *(bufoute++)=tmp&0xff; + *(bufoute++)=tmp>>8; + + + y=(bufy[j+1]-16)*Ky; + + tmp=(y+cbB)>>16; + b=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y-crG-cbG)>>16; + g=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y+crR)>>16; + r=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(int)((int)b >> 3); + tmp|=(int)(((int)g >> 2) << 5); + tmp|=(int)(((int)r >> 3) << 11); + *(bufoute++)=tmp&0xff; + *(bufoute++)=tmp>>8; + + y=(bufy[j+yskip]-16)*Ky; + + tmp=(y+cbB)>>16; + b=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y-crG-cbG)>>16; + g=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y+crR)>>16; + r=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(int)((int)b >> 3); + tmp|=(int)(((int)g >> 2) << 5); + tmp|=(int)(((int)r >> 3) << 11); + *(bufouto++)=tmp&0xff; + *(bufouto++)=tmp>>8; + + y=(bufy[j+1+yskip]-16)*Ky; + + tmp=(y+cbB)>>16; + b=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y-crG-cbG)>>16; + g=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(y+crR)>>16; + r=(tmp>255)?255:((tmp<0)?0:tmp); + tmp=(int)((int)b >> 3); + tmp|=(int)(((int)g >> 2) << 5); + tmp|=(int)(((int)r >> 3) << 11); + *(bufouto++)=tmp&0xff; + *(bufouto++)=tmp>>8; + + } + bufoute+=oskip; + bufouto+=oskip; + bufy+=yskip<<1; + } +} + +/* fix stride */ + +void RTjpeg_yuvrgb8(__u8 *buf, __u8 *rgb, int stride) +{ + bcopy(buf, rgb, RTjpeg_width*RTjpeg_height); +} + diff -r 3cea69dda1b3 -r bf37d5cd3e6e RTjpegN.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RTjpegN.h Thu Dec 27 21:27:29 2001 +0000 @@ -0,0 +1,58 @@ +/* + RTjpeg (C) Justin Schoeman 1998 (justin@suntiger.ee.up.ac.za) + + With modifications by: + (c) 1998, 1999 by Joerg Walter + and + (c) 1999 by Wim Taymans + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+
+*/
+
+#ifndef _I386_TYPES_H
+typedef unsigned char __u8;
+typedef unsigned short __u16;
+typedef unsigned long __u32;
+typedef unsigned long long __u64;
+typedef signed char __s8;
+typedef signed short __s16;
+typedef signed long __s32;
+#endif
+
+extern void RTjpeg_init_Q(__u8 Q);
+extern void RTjpeg_init_compress(long unsigned int *buf, int width, int height, __u8 Q);
+extern void RTjpeg_init_decompress(long unsigned int *buf, int width, int height);
+extern int RTjpeg_compressYUV420(__s8 *sp, unsigned char *bp);
+extern int RTjpeg_compressYUV422(__s8 *sp, unsigned char *bp);
+extern void RTjpeg_decompressYUV420(__s8 *sp, __u8 *bp);
+extern void RTjpeg_decompressYUV422(__s8 *sp, __u8 *bp);
+extern int RTjpeg_compress8(__s8 *sp, unsigned char *bp);
+extern void RTjpeg_decompress8(__s8 *sp, __u8 *bp);
+
+extern void RTjpeg_init_mcompress(void);
+extern int RTjpeg_mcompressYUV420(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask);
+extern int RTjpeg_mcompressYUV422(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask);
+extern int RTjpeg_mcompress8(__s8 *sp, unsigned char *bp, __u16 lmask);
+extern void RTjpeg_set_test(int i);
+
+extern void RTjpeg_yuv420rgb(__u8 *buf, __u8 *rgb, int stride);
+extern void RTjpeg_yuv422rgb(__u8 *buf, __u8 *rgb, int stride);
+extern void RTjpeg_yuvrgb8(__u8 *buf, __u8 *rgb, int stride);
+extern void RTjpeg_yuvrgb16(__u8 *buf, __u8 *rgb, int stride);
+extern void RTjpeg_yuvrgb24(__u8 *buf, __u8 *rgb, int stride);
+extern void RTjpeg_yuvrgb32(__u8 *buf, __u8 *rgb, int stride);
+
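For reference, a minimal usage sketch of the API declared above. This is an
editorial illustration, not part of the patch: the frame size, the buffer
sizes, the quality value and the use of the RTjpeg_compressYUV420 return value
as the compressed length are assumptions; sp is taken to be the compressed
stream and bp the raw frame, mirroring the decompress functions in this file;
and the input is assumed to be a planar YUV 4:2:0 frame whose dimensions are
multiples of 16.

#include <stdio.h>
#include "RTjpegN.h"

#define W 320
#define H 240

static unsigned long qtables[128];      /* 64 luma + 64 chroma quant words  */
static __s8  strm[W*H*2];               /* compressed frame, generous guess */
static __u8  yuv_in[W*H*3/2];           /* planar 4:2:0 source frame        */
static __u8  yuv_out[W*H*3/2];          /* planar 4:2:0 decoded frame       */
static __u8  rgb[W*H*3];                /* packed 24-bit RGB output         */

int main(void)
{
    int size;

    /* encoder side: build tables for quality 128 (the patch comments give
       the range as 32=worst .. 192=best) and compress one frame */
    RTjpeg_init_compress(qtables, W, H, 128);
    size = RTjpeg_compressYUV420(strm, yuv_in);
    printf("compressed to %d bytes\n", size);

    /* decoder side: must be fed the same 128 quantisation words */
    RTjpeg_init_decompress(qtables, W, H);
    RTjpeg_decompressYUV420(strm, yuv_out);

    /* convert to 24-bit RGB; stride 0 selects a tightly packed output */
    RTjpeg_yuv420rgb(yuv_out, rgb, 0);

    return 0;
}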