# HG changeset patch # User michael # Date 1003941580 0 # Node ID b74c2a08eac996cf75f672c4a88e220ef2a6c005 # Parent f8978836ef7f550be9771f9863fa4d05e57c13fd much better horizontal filters (transpose & use the vertical ones) :) bugfix bugs? diff -r f8978836ef7f -r b74c2a08eac9 postproc/postprocess.c --- a/postproc/postprocess.c Wed Oct 24 16:22:10 2001 +0000 +++ b/postproc/postprocess.c Wed Oct 24 16:39:40 2001 +0000 @@ -23,9 +23,9 @@ doVertLowPass E e e doVertDefFilter Ec Ec Ec isHorizDC Ec Ec -isHorizMinMaxOk a -doHorizLowPass E a a -doHorizDefFilter E ac ac +isHorizMinMaxOk a E +doHorizLowPass E e e +doHorizDefFilter E E E deRing Vertical RKAlgo1 E a a Vertical X1 a E E @@ -60,7 +60,6 @@ split this huge file fix warnings (unused vars, ...) noise reduction filters -write an exact implementation of the horizontal delocking filter ... Notes: @@ -128,7 +127,7 @@ static uint64_t temp4=0; static uint64_t temp5=0; static uint64_t pQPb=0; -static uint8_t tempBlock[16*16]; //used so the horizontal code gets aligned data +static uint8_t tempBlocks[8*16*2]; //used for the horizontal code int hFlatnessThreshold= 56 - 16; int vFlatnessThreshold= 56 - 16; @@ -277,6 +276,7 @@ "movd %%mm0, %0 \n\t" : "=r" (numEq) : "r" (src), "r" (stride) + : "%eax", "%ebx" ); numEq= (256 - numEq) &0xFF; @@ -850,7 +850,7 @@ } } -#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) +#if 0 asm volatile( "pxor %%mm7, %%mm7 \n\t" // 0 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE @@ -1295,13 +1295,13 @@ //FIXME? |255-0| = 1 /** - * Check if the given 8x8 Block is mostly "flat" and copy the unaliged data into tempBlock. + * Check if the given 8x8 Block is mostly "flat" */ -static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride) +static inline int isHorizDC(uint8_t src[], int stride) { // src++; int numEq= 0; -#ifdef HAVE_MMX +#if 0 asm volatile ( // "int $3 \n\t" "leal (%1, %2), %%ecx \n\t" @@ -1386,14 +1386,6 @@ if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++; if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++; if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++; - tempBlock[0 + y*TEMP_STRIDE] = src[0]; - tempBlock[1 + y*TEMP_STRIDE] = src[1]; - tempBlock[2 + y*TEMP_STRIDE] = src[2]; - tempBlock[3 + y*TEMP_STRIDE] = src[3]; - tempBlock[4 + y*TEMP_STRIDE] = src[4]; - tempBlock[5 + y*TEMP_STRIDE] = src[5]; - tempBlock[6 + y*TEMP_STRIDE] = src[6]; - tempBlock[7 + y*TEMP_STRIDE] = src[7]; src+= stride; } #endif @@ -1416,40 +1408,14 @@ static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP) { -#ifdef MMX_FIXME -FIXME - int isOk; - asm volatile( -// "int $3 \n\t" - "movq (%1, %2), %%mm0 \n\t" - "movq (%1, %2, 8), %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "psubusb %%mm1, %%mm0 \n\t" - "psubusb %%mm2, %%mm1 \n\t" - "por %%mm1, %%mm0 \n\t" // ABS Diff - - "movq pQPb, %%mm7 \n\t" // QP,..., QP - "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP - "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0 - "pcmpeqd b00, %%mm0 \n\t" - "psrlq $16, %%mm0 \n\t" - "pcmpeqd bFF, %%mm0 \n\t" -// "movd %%mm0, (%1, %2, 4)\n\t" - "movd %%mm0, %0 \n\t" - : "=r" (isOk) - : "r" (src), "r" (stride) - ); - return isOk; -#else if(abs(src[0] - src[7]) > 2*QP) return 0; return 1; -#endif } -static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP) +static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP) { -#ifdef HAVE_MMX +#if 0 asm volatile( "leal (%0, %1), %%ecx \n\t" "leal (%%ecx, %1, 4), %%ebx \n\t" @@ -1536,27 +1502,16 @@ : "%eax", "%ebx", "%ecx" ); #else - uint8_t *src= tempBlock; - int y; for(y=0; y>4; dst[1]= ((dst[1]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4; @@ -1830,12 +1783,10 @@ dst[7]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4; dst+= stride; - temp+= TEMP_STRIDE; } #endif } - static inline void dering(uint8_t src[], int stride, int QP) { //FIXME @@ -2185,6 +2136,171 @@ #endif } +/** + * transposes and shift the given 8x8 Block into dst1 and dst2 + */ +static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) +{ + asm( + "leal (%0, %1), %%eax \n\t" + "leal (%%eax, %1, 4), %%ebx \n\t" +// 0 1 2 3 4 5 6 7 8 9 +// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 + "movq (%0), %%mm0 \n\t" // 12345678 + "movq (%%eax), %%mm1 \n\t" // abcdefgh + "movq %%mm0, %%mm2 \n\t" // 12345678 + "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d + "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h + + "movq (%%eax, %1), %%mm1 \n\t" + "movq (%%eax, %1, 2), %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "punpcklbw %%mm3, %%mm1 \n\t" + "punpckhbw %%mm3, %%mm4 \n\t" + + "movq %%mm0, %%mm3 \n\t" + "punpcklwd %%mm1, %%mm0 \n\t" + "punpckhwd %%mm1, %%mm3 \n\t" + "movq %%mm2, %%mm1 \n\t" + "punpcklwd %%mm4, %%mm2 \n\t" + "punpckhwd %%mm4, %%mm1 \n\t" + + "movd %%mm0, 128(%2) \n\t" + "psrlq $32, %%mm0 \n\t" + "movd %%mm0, 144(%2) \n\t" + "movd %%mm3, 160(%2) \n\t" + "psrlq $32, %%mm3 \n\t" + "movd %%mm3, 176(%2) \n\t" + "movd %%mm3, 48(%3) \n\t" + "movd %%mm2, 192(%2) \n\t" + "movd %%mm2, 64(%3) \n\t" + "psrlq $32, %%mm2 \n\t" + "movd %%mm2, 80(%3) \n\t" + "movd %%mm1, 96(%3) \n\t" + "psrlq $32, %%mm1 \n\t" + "movd %%mm1, 112(%3) \n\t" + + "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 + "movq (%%ebx), %%mm1 \n\t" // abcdefgh + "movq %%mm0, %%mm2 \n\t" // 12345678 + "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d + "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h + + "movq (%%ebx, %1), %%mm1 \n\t" + "movq (%%ebx, %1, 2), %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "punpcklbw %%mm3, %%mm1 \n\t" + "punpckhbw %%mm3, %%mm4 \n\t" + + "movq %%mm0, %%mm3 \n\t" + "punpcklwd %%mm1, %%mm0 \n\t" + "punpckhwd %%mm1, %%mm3 \n\t" + "movq %%mm2, %%mm1 \n\t" + "punpcklwd %%mm4, %%mm2 \n\t" + "punpckhwd %%mm4, %%mm1 \n\t" + + "movd %%mm0, 132(%2) \n\t" + "psrlq $32, %%mm0 \n\t" + "movd %%mm0, 148(%2) \n\t" + "movd %%mm3, 164(%2) \n\t" + "psrlq $32, %%mm3 \n\t" + "movd %%mm3, 180(%2) \n\t" + "movd %%mm3, 52(%3) \n\t" + "movd %%mm2, 196(%2) \n\t" + "movd %%mm2, 68(%3) \n\t" + "psrlq $32, %%mm2 \n\t" + "movd %%mm2, 84(%3) \n\t" + "movd %%mm1, 100(%3) \n\t" + "psrlq $32, %%mm1 \n\t" + "movd %%mm1, 116(%3) \n\t" + + + :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2) + : "%eax", "%ebx" + ); +} + +/** + * transposes the given 8x8 block + */ +static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src) +{ + asm( + "leal (%0, %1), %%eax \n\t" + "leal (%%eax, %1, 4), %%ebx \n\t" +// 0 1 2 3 4 5 6 7 8 9 +// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 + "movq (%2), %%mm0 \n\t" // 12345678 + "movq 16(%2), %%mm1 \n\t" // abcdefgh + "movq %%mm0, %%mm2 \n\t" // 12345678 + "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d + "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h + + "movq 32(%2), %%mm1 \n\t" + "movq 48(%2), %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "punpcklbw %%mm3, %%mm1 \n\t" + "punpckhbw %%mm3, %%mm4 \n\t" + + "movq %%mm0, %%mm3 \n\t" + "punpcklwd %%mm1, %%mm0 \n\t" + "punpckhwd %%mm1, %%mm3 \n\t" + "movq %%mm2, %%mm1 \n\t" + "punpcklwd %%mm4, %%mm2 \n\t" + "punpckhwd %%mm4, %%mm1 \n\t" + + "movd %%mm0, (%0) \n\t" + "psrlq $32, %%mm0 \n\t" + "movd %%mm0, (%%eax) \n\t" + "movd %%mm3, (%%eax, %1) \n\t" + "psrlq $32, %%mm3 \n\t" + "movd %%mm3, (%%eax, %1, 2) \n\t" + "movd %%mm2, (%0, %1, 4) \n\t" + "psrlq $32, %%mm2 \n\t" + "movd %%mm2, (%%ebx) \n\t" + "movd %%mm1, (%%ebx, %1) \n\t" + "psrlq $32, %%mm1 \n\t" + "movd %%mm1, (%%ebx, %1, 2) \n\t" + + + "movq 64(%2), %%mm0 \n\t" // 12345678 + "movq 80(%2), %%mm1 \n\t" // abcdefgh + "movq %%mm0, %%mm2 \n\t" // 12345678 + "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d + "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h + + "movq 96(%2), %%mm1 \n\t" + "movq 112(%2), %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "punpcklbw %%mm3, %%mm1 \n\t" + "punpckhbw %%mm3, %%mm4 \n\t" + + "movq %%mm0, %%mm3 \n\t" + "punpcklwd %%mm1, %%mm0 \n\t" + "punpckhwd %%mm1, %%mm3 \n\t" + "movq %%mm2, %%mm1 \n\t" + "punpcklwd %%mm4, %%mm2 \n\t" + "punpckhwd %%mm4, %%mm1 \n\t" + + "movd %%mm0, 4(%0) \n\t" + "psrlq $32, %%mm0 \n\t" + "movd %%mm0, 4(%%eax) \n\t" + "movd %%mm3, 4(%%eax, %1) \n\t" + "psrlq $32, %%mm3 \n\t" + "movd %%mm3, 4(%%eax, %1, 2) \n\t" + "movd %%mm2, 4(%0, %1, 4) \n\t" + "psrlq $32, %%mm2 \n\t" + "movd %%mm2, 4(%%ebx) \n\t" + "movd %%mm1, 4(%%ebx, %1) \n\t" + "psrlq $32, %%mm1 \n\t" + "movd %%mm1, 4(%%ebx, %1, 2) \n\t" + + :: "r" (dst), "r" (dstStride), "r" (src) + : "%eax", "%ebx" + ); +} + + #ifdef HAVE_ODIVX_POSTPROCESS #include "../opendivx/postprocess.h" int use_old_pp=0; @@ -2710,6 +2826,8 @@ int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride]; int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4); int QPFrac= QPDelta; + uint8_t *tempBlock1= tempBlocks; + uint8_t *tempBlock2= tempBlocks + 8; #endif /* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not than use a temporary buffer */ @@ -2742,6 +2860,7 @@ for(x=0; x= 0) { #ifdef MORE_TIMING T0= rdtsc(); #endif +#ifdef HAVE_MMX + if(mode & H_RK1_FILTER) + vertRK1Filter(tempBlock1, 16, QP); + else if(mode & H_X1_FILTER) + vertX1Filter(tempBlock1, 16, QP); + else if(mode & H_DEBLOCK) + { + if( isVertDC(tempBlock1, 16)) + { + if(isVertMinMaxOk(tempBlock1, 16, QP)) + doVertLowPass(tempBlock1, 16, QP); + } + else + doVertDefFilter(tempBlock1, 16, QP); + } + + transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16); + +#else if(mode & H_X1_FILTER) horizX1Filter(dstBlock-4, stride, QP); else if(mode & H_DEBLOCK) { - if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) + if( isHorizDC(dstBlock-4, stride)) { - if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) - doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); + if(isHorizMinMaxOk(dstBlock-4, stride, QP)) + doHorizLowPass(dstBlock-4, stride, QP); } else - doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); + doHorizDefFilter(dstBlock-4, stride, QP); } +#endif #ifdef MORE_TIMING T1= rdtsc(); horizTime+= T1-T0; @@ -2929,6 +3070,10 @@ dstBlock+=8; srcBlock+=8; + + tmpXchg= tempBlock1; + tempBlock1= tempBlock2; + tempBlock2 = tmpXchg; } /* did we use a tmp buffer */ diff -r f8978836ef7f -r b74c2a08eac9 postproc/postprocess_template.c --- a/postproc/postprocess_template.c Wed Oct 24 16:22:10 2001 +0000 +++ b/postproc/postprocess_template.c Wed Oct 24 16:39:40 2001 +0000 @@ -23,9 +23,9 @@ doVertLowPass E e e doVertDefFilter Ec Ec Ec isHorizDC Ec Ec -isHorizMinMaxOk a -doHorizLowPass E a a -doHorizDefFilter E ac ac +isHorizMinMaxOk a E +doHorizLowPass E e e +doHorizDefFilter E E E deRing Vertical RKAlgo1 E a a Vertical X1 a E E @@ -60,7 +60,6 @@ split this huge file fix warnings (unused vars, ...) noise reduction filters -write an exact implementation of the horizontal delocking filter ... Notes: @@ -128,7 +127,7 @@ static uint64_t temp4=0; static uint64_t temp5=0; static uint64_t pQPb=0; -static uint8_t tempBlock[16*16]; //used so the horizontal code gets aligned data +static uint8_t tempBlocks[8*16*2]; //used for the horizontal code int hFlatnessThreshold= 56 - 16; int vFlatnessThreshold= 56 - 16; @@ -277,6 +276,7 @@ "movd %%mm0, %0 \n\t" : "=r" (numEq) : "r" (src), "r" (stride) + : "%eax", "%ebx" ); numEq= (256 - numEq) &0xFF; @@ -850,7 +850,7 @@ } } -#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) +#if 0 asm volatile( "pxor %%mm7, %%mm7 \n\t" // 0 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE @@ -1295,13 +1295,13 @@ //FIXME? |255-0| = 1 /** - * Check if the given 8x8 Block is mostly "flat" and copy the unaliged data into tempBlock. + * Check if the given 8x8 Block is mostly "flat" */ -static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride) +static inline int isHorizDC(uint8_t src[], int stride) { // src++; int numEq= 0; -#ifdef HAVE_MMX +#if 0 asm volatile ( // "int $3 \n\t" "leal (%1, %2), %%ecx \n\t" @@ -1386,14 +1386,6 @@ if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++; if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++; if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++; - tempBlock[0 + y*TEMP_STRIDE] = src[0]; - tempBlock[1 + y*TEMP_STRIDE] = src[1]; - tempBlock[2 + y*TEMP_STRIDE] = src[2]; - tempBlock[3 + y*TEMP_STRIDE] = src[3]; - tempBlock[4 + y*TEMP_STRIDE] = src[4]; - tempBlock[5 + y*TEMP_STRIDE] = src[5]; - tempBlock[6 + y*TEMP_STRIDE] = src[6]; - tempBlock[7 + y*TEMP_STRIDE] = src[7]; src+= stride; } #endif @@ -1416,40 +1408,14 @@ static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP) { -#ifdef MMX_FIXME -FIXME - int isOk; - asm volatile( -// "int $3 \n\t" - "movq (%1, %2), %%mm0 \n\t" - "movq (%1, %2, 8), %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "psubusb %%mm1, %%mm0 \n\t" - "psubusb %%mm2, %%mm1 \n\t" - "por %%mm1, %%mm0 \n\t" // ABS Diff - - "movq pQPb, %%mm7 \n\t" // QP,..., QP - "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP - "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0 - "pcmpeqd b00, %%mm0 \n\t" - "psrlq $16, %%mm0 \n\t" - "pcmpeqd bFF, %%mm0 \n\t" -// "movd %%mm0, (%1, %2, 4)\n\t" - "movd %%mm0, %0 \n\t" - : "=r" (isOk) - : "r" (src), "r" (stride) - ); - return isOk; -#else if(abs(src[0] - src[7]) > 2*QP) return 0; return 1; -#endif } -static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP) +static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP) { -#ifdef HAVE_MMX +#if 0 asm volatile( "leal (%0, %1), %%ecx \n\t" "leal (%%ecx, %1, 4), %%ebx \n\t" @@ -1536,27 +1502,16 @@ : "%eax", "%ebx", "%ecx" ); #else - uint8_t *src= tempBlock; - int y; for(y=0; y>4; dst[1]= ((dst[1]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4; @@ -1830,12 +1783,10 @@ dst[7]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4; dst+= stride; - temp+= TEMP_STRIDE; } #endif } - static inline void dering(uint8_t src[], int stride, int QP) { //FIXME @@ -2185,6 +2136,171 @@ #endif } +/** + * transposes and shift the given 8x8 Block into dst1 and dst2 + */ +static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) +{ + asm( + "leal (%0, %1), %%eax \n\t" + "leal (%%eax, %1, 4), %%ebx \n\t" +// 0 1 2 3 4 5 6 7 8 9 +// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 + "movq (%0), %%mm0 \n\t" // 12345678 + "movq (%%eax), %%mm1 \n\t" // abcdefgh + "movq %%mm0, %%mm2 \n\t" // 12345678 + "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d + "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h + + "movq (%%eax, %1), %%mm1 \n\t" + "movq (%%eax, %1, 2), %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "punpcklbw %%mm3, %%mm1 \n\t" + "punpckhbw %%mm3, %%mm4 \n\t" + + "movq %%mm0, %%mm3 \n\t" + "punpcklwd %%mm1, %%mm0 \n\t" + "punpckhwd %%mm1, %%mm3 \n\t" + "movq %%mm2, %%mm1 \n\t" + "punpcklwd %%mm4, %%mm2 \n\t" + "punpckhwd %%mm4, %%mm1 \n\t" + + "movd %%mm0, 128(%2) \n\t" + "psrlq $32, %%mm0 \n\t" + "movd %%mm0, 144(%2) \n\t" + "movd %%mm3, 160(%2) \n\t" + "psrlq $32, %%mm3 \n\t" + "movd %%mm3, 176(%2) \n\t" + "movd %%mm3, 48(%3) \n\t" + "movd %%mm2, 192(%2) \n\t" + "movd %%mm2, 64(%3) \n\t" + "psrlq $32, %%mm2 \n\t" + "movd %%mm2, 80(%3) \n\t" + "movd %%mm1, 96(%3) \n\t" + "psrlq $32, %%mm1 \n\t" + "movd %%mm1, 112(%3) \n\t" + + "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 + "movq (%%ebx), %%mm1 \n\t" // abcdefgh + "movq %%mm0, %%mm2 \n\t" // 12345678 + "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d + "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h + + "movq (%%ebx, %1), %%mm1 \n\t" + "movq (%%ebx, %1, 2), %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "punpcklbw %%mm3, %%mm1 \n\t" + "punpckhbw %%mm3, %%mm4 \n\t" + + "movq %%mm0, %%mm3 \n\t" + "punpcklwd %%mm1, %%mm0 \n\t" + "punpckhwd %%mm1, %%mm3 \n\t" + "movq %%mm2, %%mm1 \n\t" + "punpcklwd %%mm4, %%mm2 \n\t" + "punpckhwd %%mm4, %%mm1 \n\t" + + "movd %%mm0, 132(%2) \n\t" + "psrlq $32, %%mm0 \n\t" + "movd %%mm0, 148(%2) \n\t" + "movd %%mm3, 164(%2) \n\t" + "psrlq $32, %%mm3 \n\t" + "movd %%mm3, 180(%2) \n\t" + "movd %%mm3, 52(%3) \n\t" + "movd %%mm2, 196(%2) \n\t" + "movd %%mm2, 68(%3) \n\t" + "psrlq $32, %%mm2 \n\t" + "movd %%mm2, 84(%3) \n\t" + "movd %%mm1, 100(%3) \n\t" + "psrlq $32, %%mm1 \n\t" + "movd %%mm1, 116(%3) \n\t" + + + :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2) + : "%eax", "%ebx" + ); +} + +/** + * transposes the given 8x8 block + */ +static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src) +{ + asm( + "leal (%0, %1), %%eax \n\t" + "leal (%%eax, %1, 4), %%ebx \n\t" +// 0 1 2 3 4 5 6 7 8 9 +// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 + "movq (%2), %%mm0 \n\t" // 12345678 + "movq 16(%2), %%mm1 \n\t" // abcdefgh + "movq %%mm0, %%mm2 \n\t" // 12345678 + "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d + "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h + + "movq 32(%2), %%mm1 \n\t" + "movq 48(%2), %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "punpcklbw %%mm3, %%mm1 \n\t" + "punpckhbw %%mm3, %%mm4 \n\t" + + "movq %%mm0, %%mm3 \n\t" + "punpcklwd %%mm1, %%mm0 \n\t" + "punpckhwd %%mm1, %%mm3 \n\t" + "movq %%mm2, %%mm1 \n\t" + "punpcklwd %%mm4, %%mm2 \n\t" + "punpckhwd %%mm4, %%mm1 \n\t" + + "movd %%mm0, (%0) \n\t" + "psrlq $32, %%mm0 \n\t" + "movd %%mm0, (%%eax) \n\t" + "movd %%mm3, (%%eax, %1) \n\t" + "psrlq $32, %%mm3 \n\t" + "movd %%mm3, (%%eax, %1, 2) \n\t" + "movd %%mm2, (%0, %1, 4) \n\t" + "psrlq $32, %%mm2 \n\t" + "movd %%mm2, (%%ebx) \n\t" + "movd %%mm1, (%%ebx, %1) \n\t" + "psrlq $32, %%mm1 \n\t" + "movd %%mm1, (%%ebx, %1, 2) \n\t" + + + "movq 64(%2), %%mm0 \n\t" // 12345678 + "movq 80(%2), %%mm1 \n\t" // abcdefgh + "movq %%mm0, %%mm2 \n\t" // 12345678 + "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d + "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h + + "movq 96(%2), %%mm1 \n\t" + "movq 112(%2), %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "punpcklbw %%mm3, %%mm1 \n\t" + "punpckhbw %%mm3, %%mm4 \n\t" + + "movq %%mm0, %%mm3 \n\t" + "punpcklwd %%mm1, %%mm0 \n\t" + "punpckhwd %%mm1, %%mm3 \n\t" + "movq %%mm2, %%mm1 \n\t" + "punpcklwd %%mm4, %%mm2 \n\t" + "punpckhwd %%mm4, %%mm1 \n\t" + + "movd %%mm0, 4(%0) \n\t" + "psrlq $32, %%mm0 \n\t" + "movd %%mm0, 4(%%eax) \n\t" + "movd %%mm3, 4(%%eax, %1) \n\t" + "psrlq $32, %%mm3 \n\t" + "movd %%mm3, 4(%%eax, %1, 2) \n\t" + "movd %%mm2, 4(%0, %1, 4) \n\t" + "psrlq $32, %%mm2 \n\t" + "movd %%mm2, 4(%%ebx) \n\t" + "movd %%mm1, 4(%%ebx, %1) \n\t" + "psrlq $32, %%mm1 \n\t" + "movd %%mm1, 4(%%ebx, %1, 2) \n\t" + + :: "r" (dst), "r" (dstStride), "r" (src) + : "%eax", "%ebx" + ); +} + + #ifdef HAVE_ODIVX_POSTPROCESS #include "../opendivx/postprocess.h" int use_old_pp=0; @@ -2710,6 +2826,8 @@ int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride]; int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4); int QPFrac= QPDelta; + uint8_t *tempBlock1= tempBlocks; + uint8_t *tempBlock2= tempBlocks + 8; #endif /* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not than use a temporary buffer */ @@ -2742,6 +2860,7 @@ for(x=0; x= 0) { #ifdef MORE_TIMING T0= rdtsc(); #endif +#ifdef HAVE_MMX + if(mode & H_RK1_FILTER) + vertRK1Filter(tempBlock1, 16, QP); + else if(mode & H_X1_FILTER) + vertX1Filter(tempBlock1, 16, QP); + else if(mode & H_DEBLOCK) + { + if( isVertDC(tempBlock1, 16)) + { + if(isVertMinMaxOk(tempBlock1, 16, QP)) + doVertLowPass(tempBlock1, 16, QP); + } + else + doVertDefFilter(tempBlock1, 16, QP); + } + + transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16); + +#else if(mode & H_X1_FILTER) horizX1Filter(dstBlock-4, stride, QP); else if(mode & H_DEBLOCK) { - if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) + if( isHorizDC(dstBlock-4, stride)) { - if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) - doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); + if(isHorizMinMaxOk(dstBlock-4, stride, QP)) + doHorizLowPass(dstBlock-4, stride, QP); } else - doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); + doHorizDefFilter(dstBlock-4, stride, QP); } +#endif #ifdef MORE_TIMING T1= rdtsc(); horizTime+= T1-T0; @@ -2929,6 +3070,10 @@ dstBlock+=8; srcBlock+=8; + + tmpXchg= tempBlock1; + tempBlock1= tempBlock2; + tempBlock2 = tmpXchg; } /* did we use a tmp buffer */