Mercurial > mplayer.hg
view libmpcodecs/native/rtjpegn.c @ 31597:1eb8dc8f96fa
Make subdelay handling work the same way for all subtitle types and also allow
changing subtitle delay to work better with vobsubs.
This probably breaks vobsub behaviour with timestamp wrapping though.
author | reimar |
---|---|
date | Sat, 10 Jul 2010 12:53:05 +0000 |
parents | 0f1b5b68af32 |
children |
line wrap: on
line source
/* RTjpeg (C) Justin Schoeman 1998 (justin@suntiger.ee.up.ac.za) With modifications by: (c) 1998, 1999 by Joerg Walter <trouble@moes.pmnet.uni-oldenburg.de> and (c) 1999 by Wim Taymans <wim.taymans@tvd.be> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include "config.h" #include "mpbswap.h" #include "rtjpegn.h" #if HAVE_MMX #include "mmx.h" #endif //#define SHOWBLOCK 1 #define BETTERCOMPRESSION 1 static const unsigned char RTjpeg_ZZ[64]={ 0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 4, 11, 18, 25, 32, 40, 33, 26, 19, 12, 5, 6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35, 28, 21, 14, 7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30, 23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63 }; static const __u64 RTjpeg_aan_tab[64]={ 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL, 5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL, 5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL, 5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL, 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL, 3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, 2651326208ULL, 1826357504ULL, 931136000ULL, 2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, 1826357504ULL, 1258030336ULL, 641204288ULL, 1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL, 931136000ULL, 641204288ULL, 326894240ULL, }; #if !HAVE_MMX static __s32 RTjpeg_ws[64+31]; #endif static __u8 RTjpeg_alldata[2*64+4*64+4*64+4*64+4*64+32]; static __s16 *block; // rh static __s16 *RTjpeg_block; static __s32 *RTjpeg_lqt; static __s32 *RTjpeg_cqt; static __u32 *RTjpeg_liqt; static __u32 *RTjpeg_ciqt; static unsigned char RTjpeg_lb8; static unsigned char RTjpeg_cb8; static int RTjpeg_width, RTjpeg_height; static int RTjpeg_Ywidth, RTjpeg_Cwidth; static int RTjpeg_Ysize, RTjpeg_Csize; static __s16 *RTjpeg_old=NULL; #if HAVE_MMX static mmx_t RTjpeg_lmask; static mmx_t RTjpeg_cmask; #else static __u16 RTjpeg_lmask; static __u16 RTjpeg_cmask; #endif static const unsigned char RTjpeg_lum_quant_tbl[64] = { 16, 11, 10, 16, 24, 40, 51, 61, 12, 12, 14, 19, 26, 58, 60, 55, 14, 13, 16, 24, 40, 57, 69, 56, 14, 17, 22, 29, 51, 87, 80, 62, 18, 22, 37, 56, 68, 109, 103, 77, 24, 35, 55, 64, 81, 104, 113, 92, 49, 64, 78, 87, 103, 121, 120, 101, 72, 92, 95, 98, 112, 100, 103, 99 }; static const unsigned char RTjpeg_chrom_quant_tbl[64] = { 17, 18, 24, 47, 99, 99, 99, 99, 18, 21, 26, 66, 99, 99, 99, 99, 24, 26, 56, 99, 99, 99, 99, 99, 47, 66, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99 }; #ifdef BETTERCOMPRESSION /*--------------------------------------------------*/ /* better encoding, but needs a lot more cpu time */ /* seems to be more effective than old method +lzo */ /* with this encoding lzo isn't efficient anymore */ /* there is still more potential for better */ /* encoding but that would need even more cputime */ /* anyway your mileage may vary */ /* */ /* written by Martin BIELY and Roman HOCHLEITNER */ /*--------------------------------------------------*/ /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* Block to Stream (encoding) */ /* */ static int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8) { register int ci, co=1; register __s16 ZZvalue; register unsigned char bitten; register unsigned char bitoff; #ifdef SHOWBLOCK int ii; for (ii=0; ii < 64; ii++) { fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]); } fprintf(stdout, "\n\n"); #endif // first byte allways written ((__u8*)strm)[0]= (__u8)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]); ci=63; while (data[RTjpeg_ZZ[ci]]==0 && ci>0) ci--; bitten = ((unsigned char)ci) << 2; if (ci==0) { ((__u8*)strm)[1]= bitten; co = 2; return (int)co; } /* bitoff=0 because the high 6bit contain first non zero position */ bitoff = 0; co = 1; for(; ci>0; ci--) { ZZvalue = data[RTjpeg_ZZ[ci]]; switch(ZZvalue) { case 0: break; case 1: bitten |= (0x01<<bitoff); break; case -1: bitten |= (0x03<<bitoff); break; default: bitten |= (0x02<<bitoff); goto HERZWEH; break; } if( bitoff == 0 ) { ((__u8*)strm)[co]= bitten; bitten = 0; bitoff = 8; co++; } /* "fall through" */ bitoff-=2; } /* ci must be 0 */ if(bitoff != 6) { ((__u8*)strm)[co]= bitten; co++; } goto BAUCHWEH; HERZWEH: /* ci cannot be 0 */ /* correct bitoff to nibble boundaries */ switch(bitoff){ case 4: case 6: bitoff = 0; break; case 2: case 0: ((__u8*)strm)[co]= bitten; bitoff = 4; co++; bitten = 0; // clear half nibble values in bitten break; default: break; } for(; ci>0; ci--) { ZZvalue = data[RTjpeg_ZZ[ci]]; if( (ZZvalue > 7) || (ZZvalue < -7) ) { bitten |= (0x08<<bitoff); goto HIRNWEH; } bitten |= (ZZvalue&0xf)<<bitoff; if( bitoff == 0 ) { ((__u8*)strm)[co]= bitten; bitten = 0; bitoff = 8; co++; } /* "fall thru" */ bitoff-=4; } /* ci must be 0 */ if( bitoff == 0 ) { ((__u8*)strm)[co]= bitten; co++; } goto BAUCHWEH; HIRNWEH: ((__u8*)strm)[co]= bitten; co++; /* bitting is over now we bite */ for(; ci>0; ci--) { ZZvalue = data[RTjpeg_ZZ[ci]]; if(ZZvalue>0) { strm[co++]=(__s8)(ZZvalue>127)?127:ZZvalue; } else { strm[co++]=(__s8)(ZZvalue<-128)?-128:ZZvalue; } } BAUCHWEH: /* we gotoo much now we are ill */ #ifdef SHOWBLOCK { int i; fprintf(stdout, "\nco = '%d'\n", co); for (i=0; i < co+2; i++) { fprintf(stdout, "%d ", strm[i]); } fprintf(stdout, "\n\n"); } #endif return (int)co; } #else static int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8) { register int ci, co=1, tmp; register __s16 ZZvalue; #ifdef SHOWBLOCK int ii; for (ii=0; ii < 64; ii++) { fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]); } fprintf(stdout, "\n\n"); #endif (__u8)strm[0]=(__u8)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]); for(ci=1; ci<=bt8; ci++) { ZZvalue = data[RTjpeg_ZZ[ci]]; if(ZZvalue>0) { strm[co++]=(__s8)(ZZvalue>127)?127:ZZvalue; } else { strm[co++]=(__s8)(ZZvalue<-128)?-128:ZZvalue; } } for(; ci<64; ci++) { ZZvalue = data[RTjpeg_ZZ[ci]]; if(ZZvalue>0) { strm[co++]=(__s8)(ZZvalue>63)?63:ZZvalue; } else if(ZZvalue<0) { strm[co++]=(__s8)(ZZvalue<-64)?-64:ZZvalue; } else /* compress zeros */ { tmp=ci; do { ci++; } while((ci<64)&&(data[RTjpeg_ZZ[ci]]==0)); strm[co++]=(__s8)(63+(ci-tmp)); ci--; } } return (int)co; } static int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl) { int ci=1, co=1, tmp; register int i; i=RTjpeg_ZZ[0]; data[i]=((__u8)strm[0])*qtbl[i]; for(co=1; co<=bt8; co++) { i=RTjpeg_ZZ[co]; data[i]=strm[ci++]*qtbl[i]; } for(; co<64; co++) { if(strm[ci]>63) { tmp=co+strm[ci]-63; for(; co<tmp; co++)data[RTjpeg_ZZ[co]]=0; co--; } else { i=RTjpeg_ZZ[co]; data[i]=strm[ci]*qtbl[i]; } ci++; } return (int)ci; } #endif #if HAVE_MMX static void RTjpeg_quant_init(void) { int i; __s16 *qtbl; qtbl=(__s16 *)RTjpeg_lqt; for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_lqt[i]; qtbl=(__s16 *)RTjpeg_cqt; for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_cqt[i]; } static mmx_t RTjpeg_ones={0x0001000100010001LL}; static mmx_t RTjpeg_half={0x7fff7fff7fff7fffLL}; static void RTjpeg_quant(__s16 *block, __s32 *qtbl) { int i; mmx_t *bl, *ql; ql=(mmx_t *)qtbl; bl=(mmx_t *)block; movq_m2r(RTjpeg_ones, mm6); movq_m2r(RTjpeg_half, mm7); for(i=16; i; i--) { movq_m2r(*(ql++), mm0); /* quant vals (4) */ movq_m2r(*bl, mm2); /* block vals (4) */ movq_r2r(mm0, mm1); movq_r2r(mm2, mm3); punpcklwd_r2r(mm6, mm0); /* 1 qb 1 qa */ punpckhwd_r2r(mm6, mm1); /* 1 qd 1 qc */ punpcklwd_r2r(mm7, mm2); /* 32767 bb 32767 ba */ punpckhwd_r2r(mm7, mm3); /* 32767 bd 32767 bc */ pmaddwd_r2r(mm2, mm0); /* 32767+bb*qb 32767+ba*qa */ pmaddwd_r2r(mm3, mm1); /* 32767+bd*qd 32767+bc*qc */ psrad_i2r(16, mm0); psrad_i2r(16, mm1); packssdw_r2r(mm1, mm0); movq_r2m(mm0, *(bl++)); } } #else static void RTjpeg_quant_init(void) { } static void RTjpeg_quant(__s16 *block, __s32 *qtbl) { int i; for(i=0; i<64; i++) block[i]=(__s16)((block[i]*qtbl[i]+32767)>>16); } #endif /* * Perform the forward DCT on one block of samples. */ #if HAVE_MMX static mmx_t RTjpeg_C4 ={0x2D412D412D412D41LL}; static mmx_t RTjpeg_C6 ={0x187E187E187E187ELL}; static mmx_t RTjpeg_C2mC6={0x22A322A322A322A3LL}; static mmx_t RTjpeg_C2pC6={0x539F539F539F539FLL}; static mmx_t RTjpeg_zero ={0x0000000000000000LL}; #else #define FIX_0_382683433 ((__s32) 98) /* FIX(0.382683433) */ #define FIX_0_541196100 ((__s32) 139) /* FIX(0.541196100) */ #define FIX_0_707106781 ((__s32) 181) /* FIX(0.707106781) */ #define FIX_1_306562965 ((__s32) 334) /* FIX(1.306562965) */ #define DESCALE10(x) (__s16)( ((x)+128) >> 8) #define DESCALE20(x) (__s16)(((x)+32768) >> 16) #define D_MULTIPLY(var,const) ((__s32) ((var) * (const))) #endif static void RTjpeg_dct_init(void) { int i; for(i=0; i<64; i++) { RTjpeg_lqt[i]=(((__u64)RTjpeg_lqt[i]<<32)/RTjpeg_aan_tab[i]); RTjpeg_cqt[i]=(((__u64)RTjpeg_cqt[i]<<32)/RTjpeg_aan_tab[i]); } } static void RTjpeg_dctY(__u8 *idata, __s16 *odata, int rskip) { #if !HAVE_MMX __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; __s32 tmp10, tmp11, tmp12, tmp13; __s32 z1, z2, z3, z4, z5, z11, z13; __u8 *idataptr; __s16 *odataptr; __s32 *wsptr; int ctr; idataptr = idata; wsptr = RTjpeg_ws; for (ctr = 7; ctr >= 0; ctr--) { tmp0 = idataptr[0] + idataptr[7]; tmp7 = idataptr[0] - idataptr[7]; tmp1 = idataptr[1] + idataptr[6]; tmp6 = idataptr[1] - idataptr[6]; tmp2 = idataptr[2] + idataptr[5]; tmp5 = idataptr[2] - idataptr[5]; tmp3 = idataptr[3] + idataptr[4]; tmp4 = idataptr[3] - idataptr[4]; tmp10 = (tmp0 + tmp3); /* phase 2 */ tmp13 = tmp0 - tmp3; tmp11 = (tmp1 + tmp2); tmp12 = tmp1 - tmp2; wsptr[0] = (tmp10 + tmp11)<<8; /* phase 3 */ wsptr[4] = (tmp10 - tmp11)<<8; z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ wsptr[2] = (tmp13<<8) + z1; /* phase 5 */ wsptr[6] = (tmp13<<8) - z1; tmp10 = tmp4 + tmp5; /* phase 2 */ tmp11 = tmp5 + tmp6; tmp12 = tmp6 + tmp7; z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ z11 = (tmp7<<8) + z3; /* phase 5 */ z13 = (tmp7<<8) - z3; wsptr[5] = z13 + z2; /* phase 6 */ wsptr[3] = z13 - z2; wsptr[1] = z11 + z4; wsptr[7] = z11 - z4; idataptr += rskip<<3; /* advance pointer to next row */ wsptr += 8; } wsptr = RTjpeg_ws; odataptr=odata; for (ctr = 7; ctr >= 0; ctr--) { tmp0 = wsptr[0] + wsptr[56]; tmp7 = wsptr[0] - wsptr[56]; tmp1 = wsptr[8] + wsptr[48]; tmp6 = wsptr[8] - wsptr[48]; tmp2 = wsptr[16] + wsptr[40]; tmp5 = wsptr[16] - wsptr[40]; tmp3 = wsptr[24] + wsptr[32]; tmp4 = wsptr[24] - wsptr[32]; tmp10 = tmp0 + tmp3; /* phase 2 */ tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; odataptr[0] = DESCALE10(tmp10 + tmp11); /* phase 3 */ odataptr[32] = DESCALE10(tmp10 - tmp11); z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ odataptr[16] = DESCALE20((tmp13<<8) + z1); /* phase 5 */ odataptr[48] = DESCALE20((tmp13<<8) - z1); tmp10 = tmp4 + tmp5; /* phase 2 */ tmp11 = tmp5 + tmp6; tmp12 = tmp6 + tmp7; z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ z11 = (tmp7<<8) + z3; /* phase 5 */ z13 = (tmp7<<8) - z3; odataptr[40] = DESCALE20(z13 + z2); /* phase 6 */ odataptr[24] = DESCALE20(z13 - z2); odataptr[8] = DESCALE20(z11 + z4); odataptr[56] = DESCALE20(z11 - z4); odataptr++; /* advance pointer to next column */ wsptr++; } #else volatile mmx_t tmp6, tmp7; register mmx_t *dataptr = (mmx_t *)odata; mmx_t *idata2 = (mmx_t *)idata; // first copy the input 8 bit to the destination 16 bits movq_m2r(RTjpeg_zero, mm2); movq_m2r(*idata2, mm0); movq_r2r(mm0, mm1); punpcklbw_r2r(mm2, mm0); movq_r2m(mm0, *(dataptr)); punpckhbw_r2r(mm2, mm1); movq_r2m(mm1, *(dataptr+1)); idata2 += rskip; movq_m2r(*idata2, mm0); movq_r2r(mm0, mm1); punpcklbw_r2r(mm2, mm0); movq_r2m(mm0, *(dataptr+2)); punpckhbw_r2r(mm2, mm1); movq_r2m(mm1, *(dataptr+3)); idata2 += rskip; movq_m2r(*idata2, mm0); movq_r2r(mm0, mm1); punpcklbw_r2r(mm2, mm0); movq_r2m(mm0, *(dataptr+4)); punpckhbw_r2r(mm2, mm1); movq_r2m(mm1, *(dataptr+5)); idata2 += rskip; movq_m2r(*idata2, mm0); movq_r2r(mm0, mm1); punpcklbw_r2r(mm2, mm0); movq_r2m(mm0, *(dataptr+6)); punpckhbw_r2r(mm2, mm1); movq_r2m(mm1, *(dataptr+7)); idata2 += rskip; movq_m2r(*idata2, mm0); movq_r2r(mm0, mm1); punpcklbw_r2r(mm2, mm0); movq_r2m(mm0, *(dataptr+8)); punpckhbw_r2r(mm2, mm1); movq_r2m(mm1, *(dataptr+9)); idata2 += rskip; movq_m2r(*idata2, mm0); movq_r2r(mm0, mm1); punpcklbw_r2r(mm2, mm0); movq_r2m(mm0, *(dataptr+10)); punpckhbw_r2r(mm2, mm1); movq_r2m(mm1, *(dataptr+11)); idata2 += rskip; movq_m2r(*idata2, mm0); movq_r2r(mm0, mm1); punpcklbw_r2r(mm2, mm0); movq_r2m(mm0, *(dataptr+12)); punpckhbw_r2r(mm2, mm1); movq_r2m(mm1, *(dataptr+13)); idata2 += rskip; movq_m2r(*idata2, mm0); movq_r2r(mm0, mm1); punpcklbw_r2r(mm2, mm0); movq_r2m(mm0, *(dataptr+14)); punpckhbw_r2r(mm2, mm1); movq_r2m(mm1, *(dataptr+15)); /* Start Transpose to do calculations on rows */ movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into m5 movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2 movq_r2r(mm7, mm5); punpcklwd_m2r(*(dataptr+11), mm7); // m11:m01|m10:m00 - interleave first and second lines movq_r2r(mm6, mm2); punpcklwd_m2r(*(dataptr+15), mm6); // m31:m21|m30:m20 - interleave third and fourth lines movq_r2r(mm7, mm1); movq_m2r(*(dataptr+11), mm3); // m13:m13|m11:m10 - second line punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1 movq_m2r(*(dataptr+15), mm0); // m13:m13|m11:m10 - fourth line punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2 movq_r2m(mm7,*(dataptr+9)); // write result 1 punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines movq_r2m(mm1,*(dataptr+11)); // write result 2 punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines movq_r2r(mm5, mm1); punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3 movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4 punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4 movq_r2m(mm5,*(dataptr+13)); // write result 3 // last 4x4 done movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4 movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line movq_r2r(mm0, mm6); punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines movq_r2r(mm2, mm7); punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines movq_r2r(mm0, mm4); // movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines movq_r2r(mm1, mm2); // copy first line punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines movq_r2r(mm6, mm5); // copy first intermediate result movq_r2m(mm0, *(dataptr+8)); // write result 1 punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines movq_r2r(mm3, mm0); // copy third line punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines movq_r2m(mm4, *(dataptr+10)); // write result 2 out punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result punpcklwd_m2r(*(dataptr+14), mm3); // n31:n21|n30:n20 - interleave third and fourth lines movq_r2r(mm1, mm4); movq_r2m(mm6, *(dataptr+12)); // write result 3 out punpckldq_r2r(mm3, mm1); // n30:n20|n10:n00 - produce first result punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines movq_r2r(mm2, mm6); movq_r2m(mm5, *(dataptr+14)); // write result 4 out punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block) punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result movq_r2m(mm4, *(dataptr+3)); // write result 6 out punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result movq_r2m(mm2, *(dataptr+5)); // write result 7 out movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4 movq_r2m(mm6, *(dataptr+7)); // write result 8 out // Do first 4x4 quadrant, which is used in the beginning of the DCT: movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line movq_r2r(mm0, mm2); punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines movq_r2r(mm7, mm4); punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines movq_r2r(mm0, mm1); movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1 movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2 movq_r2r(mm0, mm7); // write result 1 punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */ movq_r2r(mm1, mm6); // write result 2 paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */ punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */ movq_r2r(mm2, mm3); // copy first intermediate result psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */ punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3 movq_r2m(mm7, tmp7); movq_r2r(mm2, mm5); // write result 3 movq_r2m(mm6, tmp6); punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4 paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+5 /* Stage 1 */ movq_r2r(mm3, mm4); // write result 4 /************************************************************************************************ End of Transpose ************************************************************************************************/ paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/ movq_r2r(mm0, mm7); psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/ movq_r2r(mm1, mm6); paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */ psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */ psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */ paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */ psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/ paddw_r2r(mm7, mm6); // tmp12 + tmp13 /* stage 3 */ movq_m2r(tmp6, mm2); movq_r2r(mm0, mm3); psllw_i2r(2, mm6); // m8 * 2^2 paddw_r2r(mm1, mm0); pmulhw_m2r(RTjpeg_C4, mm6); // z1 psubw_r2r(mm1, mm3); movq_r2m(mm0, *dataptr); movq_r2r(mm7, mm0); /* Odd part */ movq_r2m(mm3, *(dataptr+8)); paddw_r2r(mm5, mm4); // tmp10 movq_m2r(tmp7, mm3); paddw_r2r(mm6, mm0); // tmp32 paddw_r2r(mm2, mm5); // tmp11 psubw_r2r(mm6, mm7); // tmp33 movq_r2m(mm0, *(dataptr+4)); paddw_r2r(mm3, mm2); // tmp12 /* stage 4 */ movq_r2m(mm7, *(dataptr+12)); movq_r2r(mm4, mm1); // copy of tmp10 psubw_r2r(mm2, mm1); // tmp10 - tmp12 psllw_i2r(2, mm4); // m8 * 2^2 movq_m2r(RTjpeg_C2mC6, mm0); psllw_i2r(2, mm1); pmulhw_m2r(RTjpeg_C6, mm1); // z5 psllw_i2r(2, mm2); pmulhw_r2r(mm0, mm4); // z5 /* stage 5 */ pmulhw_m2r(RTjpeg_C2pC6, mm2); psllw_i2r(2, mm5); pmulhw_m2r(RTjpeg_C4, mm5); // z3 movq_r2r(mm3, mm0); // copy tmp7 movq_m2r(*(dataptr+1), mm7); paddw_r2r(mm1, mm4); // z2 paddw_r2r(mm1, mm2); // z4 paddw_r2r(mm5, mm0); // z11 psubw_r2r(mm5, mm3); // z13 /* stage 6 */ movq_r2r(mm3, mm5); // copy z13 psubw_r2r(mm4, mm3); // y3=z13 - z2 paddw_r2r(mm4, mm5); // y5=z13 + z2 movq_r2r(mm0, mm6); // copy z11 movq_r2m(mm3, *(dataptr+6)); //save y3 psubw_r2r(mm2, mm0); // y7=z11 - z4 movq_r2m(mm5, *(dataptr+10)); //save y5 paddw_r2r(mm2, mm6); // y1=z11 + z4 movq_r2m(mm0, *(dataptr+14)); //save y7 /************************************************ * End of 1st 4 rows ************************************************/ movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */ movq_r2r(mm7, mm0); // copy x0 movq_r2m(mm6, *(dataptr+2)); //save y1 movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */ movq_r2r(mm1, mm6); // copy x1 paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7 movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */ movq_r2r(mm2, mm5); // copy x2 psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7 movq_r2r(mm3, mm4); // copy x3 paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6 movq_r2m(mm7, tmp7); // save tmp07 movq_r2r(mm0, mm7); // copy tmp00 psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6 /* stage 2, Even Part */ paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4 movq_r2m(mm6, tmp6); // save tmp07 movq_r2r(mm1, mm6); // copy tmp01 paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5 paddw_r2r(mm7, mm6); // tmp12 + tmp13 /* stage 3, Even and stage 4 & 5 even */ movq_m2r(tmp6, mm2); // load tmp6 movq_r2r(mm0, mm3); // copy tmp10 psllw_i2r(2, mm6); // shift z1 paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11 pmulhw_m2r(RTjpeg_C4, mm6); // z1 psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11 movq_r2m(mm0, *(dataptr+1)); //save y0 movq_r2r(mm7, mm0); // copy tmp13 /* odd part */ movq_r2m(mm3, *(dataptr+9)); //save y4 paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5 movq_m2r(tmp7, mm3); // load tmp7 paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1 paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6 psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1 movq_r2m(mm0, *(dataptr+5)); //save y2 paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7 /* stage 4 */ movq_r2m(mm7, *(dataptr+13)); //save y6 movq_r2r(mm4, mm1); // copy tmp10 psubw_r2r(mm2, mm1); // tmp10 - tmp12 psllw_i2r(2, mm4); // shift tmp10 movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6 psllw_i2r(2, mm1); // shift (tmp10-tmp12) pmulhw_m2r(RTjpeg_C6, mm1); // z5 psllw_i2r(2, mm5); // prepare for multiply pmulhw_r2r(mm0, mm4); // multiply by converted real /* stage 5 */ pmulhw_m2r(RTjpeg_C4, mm5); // z3 psllw_i2r(2, mm2); // prepare for multiply pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply movq_r2r(mm3, mm0); // copy tmp7 movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7 paddw_r2r(mm1, mm4); // z2 paddw_r2r(mm5, mm0); // z11 psubw_r2r(mm5, mm3); // z13 /* stage 6 */ movq_r2r(mm3, mm5); // copy z13 paddw_r2r(mm1, mm2); // z4 movq_r2r(mm0, mm6); // copy z11 psubw_r2r(mm4, mm5); // y3 paddw_r2r(mm2, mm6); // y1 paddw_r2r(mm4, mm3); // y5 movq_r2m(mm5, *(dataptr+7)); //save y3 movq_r2m(mm6, *(dataptr+3)); //save y1 psubw_r2r(mm2, mm0); // y7 /************************************************************************************************ Start of Transpose ************************************************************************************************/ movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2 movq_r2r(mm7, mm5); // copy first line punpcklwd_r2r(mm3, mm7); // m11:m01|m10:m00 - interleave first and second lines movq_r2r(mm6, mm2); // copy third line punpcklwd_r2r(mm0, mm6); // m31:m21|m30:m20 - interleave third and fourth lines movq_r2r(mm7, mm1); // copy first intermediate result punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1 punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2 movq_r2m(mm7, *(dataptr+9)); // write result 1 punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines movq_r2m(mm1, *(dataptr+11)); // write result 2 punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines movq_r2r(mm5, mm1); // copy first intermediate result punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3 movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4 punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4 movq_r2m(mm5, *(dataptr+13)); // write result 3 /****** last 4x4 done */ movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4 movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line movq_r2r(mm0, mm6); // copy first line punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines movq_r2r(mm2, mm7); // copy third line punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines movq_r2r(mm0, mm4); // copy first intermediate result movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines movq_r2r(mm1, mm2); // copy first line punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines movq_r2r(mm6, mm5); // copy first intermediate result movq_r2m(mm0, *(dataptr+8)); // write result 1 punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines movq_r2r(mm3, mm0); // copy third line punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines movq_r2m(mm4, *(dataptr+10)); // write result 2 out punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result punpcklwd_m2r(*(dataptr+14), mm3); // n33:n23|n32:n22 - interleave third and fourth lines movq_r2r(mm1, mm4); // copy second intermediate result movq_r2m(mm6, *(dataptr+12)); // write result 3 out punpckldq_r2r(mm3, mm1); // punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines movq_r2r(mm2, mm6); // copy second intermediate result movq_r2m(mm5, *(dataptr+14)); // write result 4 out punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block) punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result movq_r2m(mm4, *(dataptr+3)); // write result 6 out punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result movq_r2m(mm2, *(dataptr+5)); // write result 7 out movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4 movq_r2m(mm6, *(dataptr+7)); // write result 8 out // Do first 4x4 quadrant, which is used in the beginning of the DCT: movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line movq_r2r(mm0, mm2); // copy first line punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines movq_r2r(mm7, mm4); // copy third line punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines movq_r2r(mm0, mm1); // copy first intermediate result movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1 movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2 movq_r2r(mm0, mm7); // write result 1 punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */ movq_r2r(mm1, mm6); // write result 2 paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */ punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */ movq_r2r(mm2, mm3); // copy first intermediate result psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */ punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3 movq_r2m(mm7, tmp7); // save tmp07 movq_r2r(mm2, mm5); // write result 3 movq_r2m(mm6, tmp6); // save tmp06 punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4 paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+x5 /* stage 1 */ movq_r2r(mm3, mm4); // write result 4 /************************************************************************************************ End of Transpose 2 ************************************************************************************************/ paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/ movq_r2r(mm0, mm7); psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/ movq_r2r(mm1, mm6); paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */ psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */ psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */ paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */ psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/ paddw_r2r(mm7, mm6); // tmp12 + tmp13 /* stage 3 */ movq_m2r(tmp6, mm2); movq_r2r(mm0, mm3); psllw_i2r(2, mm6); // m8 * 2^2 paddw_r2r(mm1, mm0); pmulhw_m2r(RTjpeg_C4, mm6); // z1 psubw_r2r(mm1, mm3); movq_r2m(mm0, *dataptr); movq_r2r(mm7, mm0); /* Odd part */ movq_r2m(mm3, *(dataptr+8)); paddw_r2r(mm5, mm4); // tmp10 movq_m2r(tmp7, mm3); paddw_r2r(mm6, mm0); // tmp32 paddw_r2r(mm2, mm5); // tmp11 psubw_r2r(mm6, mm7); // tmp33 movq_r2m(mm0, *(dataptr+4)); paddw_r2r(mm3, mm2); // tmp12 /* stage 4 */ movq_r2m(mm7, *(dataptr+12)); movq_r2r(mm4, mm1); // copy of tmp10 psubw_r2r(mm2, mm1); // tmp10 - tmp12 psllw_i2r(2, mm4); // m8 * 2^2 movq_m2r(RTjpeg_C2mC6, mm0); psllw_i2r(2, mm1); pmulhw_m2r(RTjpeg_C6, mm1); // z5 psllw_i2r(2, mm2); pmulhw_r2r(mm0, mm4); // z5 /* stage 5 */ pmulhw_m2r(RTjpeg_C2pC6, mm2); psllw_i2r(2, mm5); pmulhw_m2r(RTjpeg_C4, mm5); // z3 movq_r2r(mm3, mm0); // copy tmp7 movq_m2r(*(dataptr+1), mm7); paddw_r2r(mm1, mm4); // z2 paddw_r2r(mm1, mm2); // z4 paddw_r2r(mm5, mm0); // z11 psubw_r2r(mm5, mm3); // z13 /* stage 6 */ movq_r2r(mm3, mm5); // copy z13 psubw_r2r(mm4, mm3); // y3=z13 - z2 paddw_r2r(mm4, mm5); // y5=z13 + z2 movq_r2r(mm0, mm6); // copy z11 movq_r2m(mm3, *(dataptr+6)); //save y3 psubw_r2r(mm2, mm0); // y7=z11 - z4 movq_r2m(mm5, *(dataptr+10)); //save y5 paddw_r2r(mm2, mm6); // y1=z11 + z4 movq_r2m(mm0, *(dataptr+14)); //save y7 /************************************************ * End of 1st 4 rows ************************************************/ movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */ movq_r2r(mm7, mm0); // copy x0 movq_r2m(mm6, *(dataptr+2)); //save y1 movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */ movq_r2r(mm1, mm6); // copy x1 paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7 movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */ movq_r2r(mm2, mm5); // copy x2 psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7 movq_r2r(mm3, mm4); // copy x3 paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6 movq_r2m(mm7, tmp7); // save tmp07 movq_r2r(mm0, mm7); // copy tmp00 psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6 /* stage 2, Even Part */ paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4 movq_r2m(mm6, tmp6); // save tmp07 movq_r2r(mm1, mm6); // copy tmp01 paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5 paddw_r2r(mm7, mm6); // tmp12 + tmp13 /* stage 3, Even and stage 4 & 5 even */ movq_m2r(tmp6, mm2); // load tmp6 movq_r2r(mm0, mm3); // copy tmp10 psllw_i2r(2, mm6); // shift z1 paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11 pmulhw_m2r(RTjpeg_C4, mm6); // z1 psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11 movq_r2m(mm0, *(dataptr+1)); //save y0 movq_r2r(mm7, mm0); // copy tmp13 /* odd part */ movq_r2m(mm3, *(dataptr+9)); //save y4 paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5 movq_m2r(tmp7, mm3); // load tmp7 paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1 paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6 psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1 movq_r2m(mm0, *(dataptr+5)); //save y2 paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7 /* stage 4 */ movq_r2m(mm7, *(dataptr+13)); //save y6 movq_r2r(mm4, mm1); // copy tmp10 psubw_r2r(mm2, mm1); // tmp10 - tmp12 psllw_i2r(2, mm4); // shift tmp10 movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6 psllw_i2r(2, mm1); // shift (tmp10-tmp12) pmulhw_m2r(RTjpeg_C6, mm1); // z5 psllw_i2r(2, mm5); // prepare for multiply pmulhw_r2r(mm0, mm4); // multiply by converted real /* stage 5 */ pmulhw_m2r(RTjpeg_C4, mm5); // z3 psllw_i2r(2, mm2); // prepare for multiply pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply movq_r2r(mm3, mm0); // copy tmp7 movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7 paddw_r2r(mm1, mm4); // z2 paddw_r2r(mm5, mm0); // z11 psubw_r2r(mm5, mm3); // z13 /* stage 6 */ movq_r2r(mm3, mm5); // copy z13 paddw_r2r(mm1, mm2); // z4 movq_r2r(mm0, mm6); // copy z11 psubw_r2r(mm4, mm5); // y3 paddw_r2r(mm2, mm6); // y1 paddw_r2r(mm4, mm3); // y5 movq_r2m(mm5, *(dataptr+7)); //save y3 psubw_r2r(mm2, mm0); // yè=z11 - z4 movq_r2m(mm3, *(dataptr+11)); //save y5 movq_r2m(mm6, *(dataptr+3)); //save y1 movq_r2m(mm0, *(dataptr+15)); //save y7 #endif } /* Main Routines This file contains most of the initialisation and control functions (C) Justin Schoeman 1998 */ /* Private function Initialise all the cache-aliged data blocks */ static void RTjpeg_init_data(void) { unsigned long dptr; dptr=(unsigned long)&(RTjpeg_alldata[0]); dptr+=32; dptr=dptr>>5; dptr=dptr<<5; /* cache align data */ RTjpeg_block=(__s16 *)dptr; dptr+=sizeof(__s16)*64; RTjpeg_lqt=(__s32 *)dptr; dptr+=sizeof(__s32)*64; RTjpeg_cqt=(__s32 *)dptr; dptr+=sizeof(__s32)*64; RTjpeg_liqt=(__u32 *)dptr; dptr+=sizeof(__u32)*64; RTjpeg_ciqt=(__u32 *)dptr; } /* External Function Re-set quality factor Input: buf -> pointer to 128 ints for quant values store to pass back to init_decompress. Q -> quality factor (192=best, 32=worst) */ static void RTjpeg_init_Q(__u8 Q) { int i; __u64 qual; qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */ for(i=0; i<64; i++) { RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3); if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1; RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3); if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1; RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3); RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3); RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3; RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3; } RTjpeg_lb8=0; while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8); RTjpeg_lb8--; RTjpeg_cb8=0; while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8); RTjpeg_cb8--; RTjpeg_dct_init(); RTjpeg_quant_init(); } /* External Function Initialise compression. Input: buf -> pointer to 128 ints for quant values store to pass back to init_decompress. width -> width of image height -> height of image Q -> quality factor (192=best, 32=worst) */ void RTjpeg_init_compress(__u32 *buf, int width, int height, __u8 Q) { int i; __u64 qual; RTjpeg_init_data(); RTjpeg_width=width; RTjpeg_height=height; RTjpeg_Ywidth = RTjpeg_width>>3; RTjpeg_Ysize=width * height; RTjpeg_Cwidth = RTjpeg_width>>4; RTjpeg_Csize= (width>>1) * height; qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */ for(i=0; i<64; i++) { RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3); if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1; RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3); if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1; RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3); RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3); RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3; RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3; } RTjpeg_lb8=0; while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8); RTjpeg_lb8--; RTjpeg_cb8=0; while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8); RTjpeg_cb8--; RTjpeg_dct_init(); RTjpeg_quant_init(); for(i=0; i<64; i++) buf[i]=le2me_32(RTjpeg_liqt[i]); for(i=0; i<64; i++) buf[64+i]=le2me_32(RTjpeg_ciqt[i]); } int RTjpeg_compressYUV420(__s8 *sp, unsigned char *bp) { __s8 * sb; register __s8 * bp1 = bp + (RTjpeg_width<<3); register __s8 * bp2 = bp + RTjpeg_Ysize; register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1); register int i, j, k; #if HAVE_MMX emms(); #endif sb=sp; /* Y */ for(i=RTjpeg_height>>1; i; i-=8) { for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8) { RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth); RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth); RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth); RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth); RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth); RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth); RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); } bp+=RTjpeg_width<<4; bp1+=RTjpeg_width<<4; bp2+=RTjpeg_width<<2; bp3+=RTjpeg_width<<2; } #if HAVE_MMX emms(); #endif return (sp-sb); } /* External Function Initialise additional data structures for motion compensation */ void RTjpeg_init_mcompress(void) { unsigned long tmp; if(!RTjpeg_old) { RTjpeg_old=malloc((4*RTjpeg_width*RTjpeg_height)+32); tmp=(unsigned long)RTjpeg_old; tmp+=32; tmp=tmp>>5; RTjpeg_old=(__s16 *)(tmp<<5); } if (!RTjpeg_old) { fprintf(stderr, "RTjpeg: Could not allocate memory\n"); exit(-1); } memset(RTjpeg_old, 0, ((4*RTjpeg_width*RTjpeg_height))); } #if HAVE_MMX static int RTjpeg_bcomp(__s16 *old, mmx_t *mask) { int i; mmx_t *mold=(mmx_t *)old; mmx_t *mblock=(mmx_t *)RTjpeg_block; volatile mmx_t result; static mmx_t neg={0xffffffffffffffffULL}; movq_m2r(*mask, mm7); movq_m2r(neg, mm6); pxor_r2r(mm5, mm5); for(i=0; i<8; i++) { movq_m2r(*(mblock++), mm0); movq_m2r(*(mblock++), mm2); movq_m2r(*(mold++), mm1); movq_m2r(*(mold++), mm3); psubsw_r2r(mm1, mm0); psubsw_r2r(mm3, mm2); movq_r2r(mm0, mm1); movq_r2r(mm2, mm3); pcmpgtw_r2r(mm7, mm0); pcmpgtw_r2r(mm7, mm2); pxor_r2r(mm6, mm1); pxor_r2r(mm6, mm3); pcmpgtw_r2r(mm7, mm1); pcmpgtw_r2r(mm7, mm3); por_r2r(mm0, mm5); por_r2r(mm2, mm5); por_r2r(mm1, mm5); por_r2r(mm3, mm5); } movq_r2m(mm5, result); if(result.q) { return 0; } return 1; } #else static int RTjpeg_bcomp(__s16 *old, __u16 *mask) { int i; for(i=0; i<64; i++) if(abs(old[i]-RTjpeg_block[i])>*mask) { for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i]; return 0; } return 1; } #endif int RTjpeg_mcompressYUV420(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask) { __s8 * sb; register __s8 * bp1 = bp + (RTjpeg_width<<3); register __s8 * bp2 = bp + RTjpeg_Ysize; register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1); register int i, j, k; #if HAVE_MMX emms(); RTjpeg_lmask.uq=((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask; RTjpeg_cmask.uq=((__u64)cmask<<48)|((__u64)cmask<<32)|((__u64)cmask<<16)|cmask; #else RTjpeg_lmask=lmask; RTjpeg_cmask=cmask; #endif sb=sp; block=RTjpeg_old; /* Y */ for(i=RTjpeg_height>>1; i; i-=8) { for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8) { RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth); RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); if(RTjpeg_bcomp(block, &RTjpeg_lmask)) { *((__u8 *)sp++)=255; } else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); block+=64; RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth); RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); if(RTjpeg_bcomp(block, &RTjpeg_lmask)) { *((__u8 *)sp++)=255; } else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); block+=64; RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth); RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); if(RTjpeg_bcomp(block, &RTjpeg_lmask)) { *((__u8 *)sp++)=255; } else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); block+=64; RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth); RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); if(RTjpeg_bcomp(block, &RTjpeg_lmask)) { *((__u8 *)sp++)=255; } else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); block+=64; RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth); RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); if(RTjpeg_bcomp(block, &RTjpeg_cmask)) { *((__u8 *)sp++)=255; } else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); block+=64; RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth); RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); if(RTjpeg_bcomp(block, &RTjpeg_cmask)) { *((__u8 *)sp++)=255; } else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); block+=64; } bp+=RTjpeg_width<<4; bp1+=RTjpeg_width<<4; bp2+=RTjpeg_width<<2; bp3+=RTjpeg_width<<2; } #if HAVE_MMX emms(); #endif return (sp-sb); }