Mercurial > libavcodec.hg
changeset 167:2d97f0157a79 libavcodec
faster dering
author | michael |
---|---|
date | Sat, 24 Nov 2001 01:38:30 +0000 |
parents | ec349ac7869b |
children | 712c7a115164 |
files | libpostproc/postprocess.c libpostproc/postprocess_template.c |
diffstat | 2 files changed, 238 insertions(+), 114 deletions(-) [+] |
line wrap: on
line diff
--- a/libpostproc/postprocess.c Tue Nov 20 20:46:39 2001 +0000 +++ b/libpostproc/postprocess.c Sat Nov 24 01:38:30 2001 +0000 @@ -47,7 +47,6 @@ /* TODO: -verify that everything workes as it should (how?) reduce the time wasted on the mem transfer implement everything in C at least (done at the moment but ...) unroll stuff if instructions depend too much on the prior one @@ -62,7 +61,8 @@ optimize c versions try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks smart blur -commandline option for the deblock thresholds +commandline option for the deblock / dering thresholds +memcpy chrominance if no chroma filtering is done ... */ @@ -162,6 +162,7 @@ int hFlatnessThreshold= 56 - 16; int vFlatnessThreshold= 56 - 16; +int deringThreshold= 20; //amount of "black" u r willing to loose to get a brightness corrected picture double maxClippedThreshold= 0.01; @@ -310,28 +311,26 @@ "paddb %%mm2, %%mm0 \n\t" " \n\t" +#ifdef HAVE_MMX2 + "pxor %%mm7, %%mm7 \n\t" + "psadbw %%mm7, %%mm0 \n\t" +#else "movq %%mm0, %%mm1 \n\t" "psrlw $8, %%mm0 \n\t" "paddb %%mm1, %%mm0 \n\t" -#ifdef HAVE_MMX2 - "pshufw $0xF9, %%mm0, %%mm1 \n\t" - "paddb %%mm1, %%mm0 \n\t" - "pshufw $0xFE, %%mm0, %%mm1 \n\t" -#else "movq %%mm0, %%mm1 \n\t" "psrlq $16, %%mm0 \n\t" "paddb %%mm1, %%mm0 \n\t" "movq %%mm0, %%mm1 \n\t" "psrlq $32, %%mm0 \n\t" + "paddb %%mm1, %%mm0 \n\t" #endif - "paddb %%mm1, %%mm0 \n\t" "movd %%mm0, %0 \n\t" : "=r" (numEq) : "r" (src), "r" (stride) - : "%eax", "%ebx" + : "%ebx" ); - - numEq= (256 - numEq) &0xFF; + numEq= (-numEq) &0xFF; #else for(y=0; y<BLOCK_SIZE-1; y++) @@ -1591,21 +1590,21 @@ // 0 1 2 3 4 5 6 7 8 9 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 - "pcmpeqb %%mm6, %%mm6 \n\t" - "pxor %%mm7, %%mm7 \n\t" + "pcmpeqb %%mm7, %%mm7 \n\t" + "pxor %%mm6, %%mm6 \n\t" #ifdef HAVE_MMX2 #define FIND_MIN_MAX(addr)\ "movq " #addr ", %%mm0 \n\t"\ - "pminub %%mm0, %%mm6 \n\t"\ - "pmaxub %%mm0, %%mm7 \n\t" + "pminub %%mm0, %%mm7 \n\t"\ + "pmaxub %%mm0, %%mm6 \n\t" #else #define FIND_MIN_MAX(addr)\ "movq " #addr ", %%mm0 \n\t"\ - "movq %%mm6, %%mm1 \n\t"\ - "psubusb %%mm0, %%mm7 \n\t"\ - "paddb %%mm0, %%mm7 \n\t"\ + "movq %%mm7, %%mm1 \n\t"\ + "psubusb %%mm0, %%mm6 \n\t"\ + "paddb %%mm0, %%mm6 \n\t"\ "psubusb %%mm0, %%mm1 \n\t"\ - "psubb %%mm1, %%mm6 \n\t" + "psubb %%mm1, %%mm7 \n\t" #endif FIND_MIN_MAX((%%eax)) @@ -1617,52 +1616,57 @@ FIND_MIN_MAX((%%ebx, %1, 2)) FIND_MIN_MAX((%0, %1, 8)) + "movq %%mm7, %%mm4 \n\t" + "psrlq $8, %%mm7 \n\t" +#ifdef HAVE_MMX2 + "pminub %%mm4, %%mm7 \n\t" // min of pixels + "pshufw $0xF9, %%mm7, %%mm4 \n\t" + "pminub %%mm4, %%mm7 \n\t" // min of pixels + "pshufw $0xFE, %%mm7, %%mm4 \n\t" + "pminub %%mm4, %%mm7 \n\t" +#else + "movq %%mm7, %%mm1 \n\t" + "psubusb %%mm4, %%mm1 \n\t" + "psubb %%mm1, %%mm7 \n\t" + "movq %%mm7, %%mm4 \n\t" + "psrlq $16, %%mm7 \n\t" + "movq %%mm7, %%mm1 \n\t" + "psubusb %%mm4, %%mm1 \n\t" + "psubb %%mm1, %%mm7 \n\t" + "movq %%mm7, %%mm4 \n\t" + "psrlq $32, %%mm7 \n\t" + "movq %%mm7, %%mm1 \n\t" + "psubusb %%mm4, %%mm1 \n\t" + "psubb %%mm1, %%mm7 \n\t" +#endif + + "movq %%mm6, %%mm4 \n\t" "psrlq $8, %%mm6 \n\t" #ifdef HAVE_MMX2 - "pminub %%mm4, %%mm6 \n\t" // min of pixels + "pmaxub %%mm4, %%mm6 \n\t" // max of pixels "pshufw $0xF9, %%mm6, %%mm4 \n\t" - "pminub %%mm4, %%mm6 \n\t" // min of pixels + "pmaxub %%mm4, %%mm6 \n\t" "pshufw $0xFE, %%mm6, %%mm4 \n\t" - "pminub %%mm4, %%mm6 \n\t" + "pmaxub %%mm4, %%mm6 \n\t" #else - "movq %%mm6, %%mm1 \n\t" - "psubusb %%mm4, %%mm1 \n\t" - "psubb %%mm1, %%mm6 \n\t" + "psubusb %%mm4, %%mm6 \n\t" + "paddb %%mm4, %%mm6 \n\t" "movq %%mm6, %%mm4 \n\t" "psrlq $16, %%mm6 \n\t" - "movq %%mm6, %%mm1 \n\t" - "psubusb %%mm4, %%mm1 \n\t" - "psubb %%mm1, %%mm6 \n\t" + "psubusb %%mm4, %%mm6 \n\t" + "paddb %%mm4, %%mm6 \n\t" "movq %%mm6, %%mm4 \n\t" "psrlq $32, %%mm6 \n\t" - "movq %%mm6, %%mm1 \n\t" - "psubusb %%mm4, %%mm1 \n\t" - "psubb %%mm1, %%mm6 \n\t" + "psubusb %%mm4, %%mm6 \n\t" + "paddb %%mm4, %%mm6 \n\t" #endif - - - "movq %%mm7, %%mm4 \n\t" - "psrlq $8, %%mm7 \n\t" -#ifdef HAVE_MMX2 - "pmaxub %%mm4, %%mm7 \n\t" // max of pixels - "pshufw $0xF9, %%mm7, %%mm4 \n\t" - "pmaxub %%mm4, %%mm7 \n\t" - "pshufw $0xFE, %%mm7, %%mm4 \n\t" - "pmaxub %%mm4, %%mm7 \n\t" -#else - "psubusb %%mm4, %%mm7 \n\t" - "paddb %%mm4, %%mm7 \n\t" - "movq %%mm7, %%mm4 \n\t" - "psrlq $16, %%mm7 \n\t" - "psubusb %%mm4, %%mm7 \n\t" - "paddb %%mm4, %%mm7 \n\t" - "movq %%mm7, %%mm4 \n\t" - "psrlq $32, %%mm7 \n\t" - "psubusb %%mm4, %%mm7 \n\t" - "paddb %%mm4, %%mm7 \n\t" -#endif - PAVGB(%%mm6, %%mm7) // a=(max + min)/2 + "movq %%mm6, %%mm0 \n\t" // max + "psubb %%mm7, %%mm6 \n\t" // max - min + "movd %%mm6, %%ecx \n\t" + "cmpb deringThreshold, %%cl \n\t" + " jb 1f \n\t" + PAVGB(%%mm0, %%mm7) // a=(max + min)/2 "punpcklbw %%mm7, %%mm7 \n\t" "punpcklbw %%mm7, %%mm7 \n\t" "punpcklbw %%mm7, %%mm7 \n\t" @@ -1785,9 +1789,9 @@ DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) - + "1: \n\t" : : "r" (src), "r" (stride), "r" (QP) - : "%eax", "%ebx" + : "%eax", "%ebx", "%ecx" ); #else int y; @@ -1810,6 +1814,8 @@ } avg= (min + max + 1)/2; + if(max - min <deringThreshold) return; + for(y=0; y<10; y++) { int x; @@ -1842,13 +1848,69 @@ +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); f= (f + 8)>>4; +#ifdef DEBUG_DERING_THRESHOLD + asm volatile("emms\n\t":); + { + static long long numPixels=0; + if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; +// if((max-min)<20 || (max-min)*QP<200) +// if((max-min)*QP < 500) +// if(max-min<QP/2) + if(max-min < 20) + { + static int numSkiped=0; + static int errorSum=0; + static int worstQP=0; + static int worstRange=0; + static int worstDiff=0; + int diff= (f - *p); + int absDiff= ABS(diff); + int error= diff*diff; + + if(x==1 || x==8 || y==1 || y==8) continue; + + numSkiped++; + if(absDiff > worstDiff) + { + worstDiff= absDiff; + worstQP= QP; + worstRange= max-min; + } + errorSum+= error; + + if(1024LL*1024LL*1024LL % numSkiped == 0) + { + printf( "sum:%1.3f, skip:%d, wQP:%d, " + "wRange:%d, wDiff:%d, relSkip:%1.3f\n", + (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, + worstDiff, (float)numSkiped/numPixels); + } + } + } +#endif if (*p + 2*QP < f) *p= *p + 2*QP; else if(*p - 2*QP > f) *p= *p - 2*QP; else *p=f; } } } - +#ifdef DEBUG_DERING_THRESHOLD + if(max-min < 20) + { + for(y=1; y<9; y++) + { + int x; + int t = 0; + p= src + stride*y; + for(x=1; x<9; x++) + { + p++; + *p = MIN(*p + 20, 255); + } + } +// src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; + } +#endif #endif }
--- a/libpostproc/postprocess_template.c Tue Nov 20 20:46:39 2001 +0000 +++ b/libpostproc/postprocess_template.c Sat Nov 24 01:38:30 2001 +0000 @@ -47,7 +47,6 @@ /* TODO: -verify that everything workes as it should (how?) reduce the time wasted on the mem transfer implement everything in C at least (done at the moment but ...) unroll stuff if instructions depend too much on the prior one @@ -62,7 +61,8 @@ optimize c versions try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks smart blur -commandline option for the deblock thresholds +commandline option for the deblock / dering thresholds +memcpy chrominance if no chroma filtering is done ... */ @@ -162,6 +162,7 @@ int hFlatnessThreshold= 56 - 16; int vFlatnessThreshold= 56 - 16; +int deringThreshold= 20; //amount of "black" u r willing to loose to get a brightness corrected picture double maxClippedThreshold= 0.01; @@ -310,28 +311,26 @@ "paddb %%mm2, %%mm0 \n\t" " \n\t" +#ifdef HAVE_MMX2 + "pxor %%mm7, %%mm7 \n\t" + "psadbw %%mm7, %%mm0 \n\t" +#else "movq %%mm0, %%mm1 \n\t" "psrlw $8, %%mm0 \n\t" "paddb %%mm1, %%mm0 \n\t" -#ifdef HAVE_MMX2 - "pshufw $0xF9, %%mm0, %%mm1 \n\t" - "paddb %%mm1, %%mm0 \n\t" - "pshufw $0xFE, %%mm0, %%mm1 \n\t" -#else "movq %%mm0, %%mm1 \n\t" "psrlq $16, %%mm0 \n\t" "paddb %%mm1, %%mm0 \n\t" "movq %%mm0, %%mm1 \n\t" "psrlq $32, %%mm0 \n\t" + "paddb %%mm1, %%mm0 \n\t" #endif - "paddb %%mm1, %%mm0 \n\t" "movd %%mm0, %0 \n\t" : "=r" (numEq) : "r" (src), "r" (stride) - : "%eax", "%ebx" + : "%ebx" ); - - numEq= (256 - numEq) &0xFF; + numEq= (-numEq) &0xFF; #else for(y=0; y<BLOCK_SIZE-1; y++) @@ -1591,21 +1590,21 @@ // 0 1 2 3 4 5 6 7 8 9 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 - "pcmpeqb %%mm6, %%mm6 \n\t" - "pxor %%mm7, %%mm7 \n\t" + "pcmpeqb %%mm7, %%mm7 \n\t" + "pxor %%mm6, %%mm6 \n\t" #ifdef HAVE_MMX2 #define FIND_MIN_MAX(addr)\ "movq " #addr ", %%mm0 \n\t"\ - "pminub %%mm0, %%mm6 \n\t"\ - "pmaxub %%mm0, %%mm7 \n\t" + "pminub %%mm0, %%mm7 \n\t"\ + "pmaxub %%mm0, %%mm6 \n\t" #else #define FIND_MIN_MAX(addr)\ "movq " #addr ", %%mm0 \n\t"\ - "movq %%mm6, %%mm1 \n\t"\ - "psubusb %%mm0, %%mm7 \n\t"\ - "paddb %%mm0, %%mm7 \n\t"\ + "movq %%mm7, %%mm1 \n\t"\ + "psubusb %%mm0, %%mm6 \n\t"\ + "paddb %%mm0, %%mm6 \n\t"\ "psubusb %%mm0, %%mm1 \n\t"\ - "psubb %%mm1, %%mm6 \n\t" + "psubb %%mm1, %%mm7 \n\t" #endif FIND_MIN_MAX((%%eax)) @@ -1617,52 +1616,57 @@ FIND_MIN_MAX((%%ebx, %1, 2)) FIND_MIN_MAX((%0, %1, 8)) + "movq %%mm7, %%mm4 \n\t" + "psrlq $8, %%mm7 \n\t" +#ifdef HAVE_MMX2 + "pminub %%mm4, %%mm7 \n\t" // min of pixels + "pshufw $0xF9, %%mm7, %%mm4 \n\t" + "pminub %%mm4, %%mm7 \n\t" // min of pixels + "pshufw $0xFE, %%mm7, %%mm4 \n\t" + "pminub %%mm4, %%mm7 \n\t" +#else + "movq %%mm7, %%mm1 \n\t" + "psubusb %%mm4, %%mm1 \n\t" + "psubb %%mm1, %%mm7 \n\t" + "movq %%mm7, %%mm4 \n\t" + "psrlq $16, %%mm7 \n\t" + "movq %%mm7, %%mm1 \n\t" + "psubusb %%mm4, %%mm1 \n\t" + "psubb %%mm1, %%mm7 \n\t" + "movq %%mm7, %%mm4 \n\t" + "psrlq $32, %%mm7 \n\t" + "movq %%mm7, %%mm1 \n\t" + "psubusb %%mm4, %%mm1 \n\t" + "psubb %%mm1, %%mm7 \n\t" +#endif + + "movq %%mm6, %%mm4 \n\t" "psrlq $8, %%mm6 \n\t" #ifdef HAVE_MMX2 - "pminub %%mm4, %%mm6 \n\t" // min of pixels + "pmaxub %%mm4, %%mm6 \n\t" // max of pixels "pshufw $0xF9, %%mm6, %%mm4 \n\t" - "pminub %%mm4, %%mm6 \n\t" // min of pixels + "pmaxub %%mm4, %%mm6 \n\t" "pshufw $0xFE, %%mm6, %%mm4 \n\t" - "pminub %%mm4, %%mm6 \n\t" + "pmaxub %%mm4, %%mm6 \n\t" #else - "movq %%mm6, %%mm1 \n\t" - "psubusb %%mm4, %%mm1 \n\t" - "psubb %%mm1, %%mm6 \n\t" + "psubusb %%mm4, %%mm6 \n\t" + "paddb %%mm4, %%mm6 \n\t" "movq %%mm6, %%mm4 \n\t" "psrlq $16, %%mm6 \n\t" - "movq %%mm6, %%mm1 \n\t" - "psubusb %%mm4, %%mm1 \n\t" - "psubb %%mm1, %%mm6 \n\t" + "psubusb %%mm4, %%mm6 \n\t" + "paddb %%mm4, %%mm6 \n\t" "movq %%mm6, %%mm4 \n\t" "psrlq $32, %%mm6 \n\t" - "movq %%mm6, %%mm1 \n\t" - "psubusb %%mm4, %%mm1 \n\t" - "psubb %%mm1, %%mm6 \n\t" + "psubusb %%mm4, %%mm6 \n\t" + "paddb %%mm4, %%mm6 \n\t" #endif - - - "movq %%mm7, %%mm4 \n\t" - "psrlq $8, %%mm7 \n\t" -#ifdef HAVE_MMX2 - "pmaxub %%mm4, %%mm7 \n\t" // max of pixels - "pshufw $0xF9, %%mm7, %%mm4 \n\t" - "pmaxub %%mm4, %%mm7 \n\t" - "pshufw $0xFE, %%mm7, %%mm4 \n\t" - "pmaxub %%mm4, %%mm7 \n\t" -#else - "psubusb %%mm4, %%mm7 \n\t" - "paddb %%mm4, %%mm7 \n\t" - "movq %%mm7, %%mm4 \n\t" - "psrlq $16, %%mm7 \n\t" - "psubusb %%mm4, %%mm7 \n\t" - "paddb %%mm4, %%mm7 \n\t" - "movq %%mm7, %%mm4 \n\t" - "psrlq $32, %%mm7 \n\t" - "psubusb %%mm4, %%mm7 \n\t" - "paddb %%mm4, %%mm7 \n\t" -#endif - PAVGB(%%mm6, %%mm7) // a=(max + min)/2 + "movq %%mm6, %%mm0 \n\t" // max + "psubb %%mm7, %%mm6 \n\t" // max - min + "movd %%mm6, %%ecx \n\t" + "cmpb deringThreshold, %%cl \n\t" + " jb 1f \n\t" + PAVGB(%%mm0, %%mm7) // a=(max + min)/2 "punpcklbw %%mm7, %%mm7 \n\t" "punpcklbw %%mm7, %%mm7 \n\t" "punpcklbw %%mm7, %%mm7 \n\t" @@ -1785,9 +1789,9 @@ DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) - + "1: \n\t" : : "r" (src), "r" (stride), "r" (QP) - : "%eax", "%ebx" + : "%eax", "%ebx", "%ecx" ); #else int y; @@ -1810,6 +1814,8 @@ } avg= (min + max + 1)/2; + if(max - min <deringThreshold) return; + for(y=0; y<10; y++) { int x; @@ -1842,13 +1848,69 @@ +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); f= (f + 8)>>4; +#ifdef DEBUG_DERING_THRESHOLD + asm volatile("emms\n\t":); + { + static long long numPixels=0; + if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; +// if((max-min)<20 || (max-min)*QP<200) +// if((max-min)*QP < 500) +// if(max-min<QP/2) + if(max-min < 20) + { + static int numSkiped=0; + static int errorSum=0; + static int worstQP=0; + static int worstRange=0; + static int worstDiff=0; + int diff= (f - *p); + int absDiff= ABS(diff); + int error= diff*diff; + + if(x==1 || x==8 || y==1 || y==8) continue; + + numSkiped++; + if(absDiff > worstDiff) + { + worstDiff= absDiff; + worstQP= QP; + worstRange= max-min; + } + errorSum+= error; + + if(1024LL*1024LL*1024LL % numSkiped == 0) + { + printf( "sum:%1.3f, skip:%d, wQP:%d, " + "wRange:%d, wDiff:%d, relSkip:%1.3f\n", + (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, + worstDiff, (float)numSkiped/numPixels); + } + } + } +#endif if (*p + 2*QP < f) *p= *p + 2*QP; else if(*p - 2*QP > f) *p= *p - 2*QP; else *p=f; } } } - +#ifdef DEBUG_DERING_THRESHOLD + if(max-min < 20) + { + for(y=1; y<9; y++) + { + int x; + int t = 0; + p= src + stride*y; + for(x=1; x<9; x++) + { + p++; + *p = MIN(*p + 20, 255); + } + } +// src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; + } +#endif #endif }