comparison libpostproc/postprocess_template.c @ 97:e57b1d38d71f libavcodec

bugfixes: last 3 lines not brightness/contrast corrected; brightness statistics messed up with initial black pic; changed initial values of the brightness statistics; C++ -> C conversion; QP range question solved (very likely 1<=QP<=32 according to arpi); new experimental vertical deblocking filter; RK filter has 3dNow support now (untested)
author michael
date Thu, 11 Oct 2001 22:35:45 +0000
parents 29ac11dc53d3
children eaae16507d9b
comparison
equal deleted inserted replaced
96:29ac11dc53d3 97:e57b1d38d71f
25 isHorizDC Ec Ec 25 isHorizDC Ec Ec
26 isHorizMinMaxOk a 26 isHorizMinMaxOk a
27 doHorizLowPass E a a* 27 doHorizLowPass E a a*
28 doHorizDefFilter E ac ac 28 doHorizDefFilter E ac ac
29 deRing 29 deRing
30 RKAlgo1 E a a*
31 X1 a E E*
32
30 33
31 * i dont have a 3dnow CPU -> its untested 34 * i dont have a 3dnow CPU -> its untested
32 E = Exact implementation 35 E = Exact implementation
33 e = allmost exact implementation 36 e = allmost exact implementation
34 a = alternative / approximate impl 37 a = alternative / approximate impl
39 TODO: 42 TODO:
40 verify that everything workes as it should 43 verify that everything workes as it should
41 reduce the time wasted on the mem transfer 44 reduce the time wasted on the mem transfer
42 implement dering 45 implement dering
43 implement everything in C at least (done at the moment but ...) 46 implement everything in C at least (done at the moment but ...)
44 figure range of QP out (assuming <256 for now)
45 unroll stuff if instructions depend too much on the prior one 47 unroll stuff if instructions depend too much on the prior one
46 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? 48 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
47 move YScale thing to the end instead of fixing QP 49 move YScale thing to the end instead of fixing QP
48 write a faster and higher quality deblocking filter :) 50 write a faster and higher quality deblocking filter :)
51 do something about the speed of the horizontal filters
52 make the mainloop more flexible (variable number of blocks at once
53 (the if/else stuff per block is slowing things down)
49 ... 54 ...
50 55
51 Notes: 56 Notes:
52 57
53 */ 58 */
54 59
55 /* 60 /*
56 Changelog: 61 Changelog:
62 0.1.3
63 bugfixes: last 3 lines not brightness/contrast corrected
64 brightness statistics messed up with initial black pic
65 changed initial values of the brightness statistics
66 C++ -> C conversion
67 QP range question solved (very likely 1<=QP<=32 according to arpi)
68 new experimental vertical deblocking filter
69 RK filter has 3dNow support now (untested)
57 0.1.2 70 0.1.2
58 fixed a bug in the horizontal default filter 71 fixed a bug in the horizontal default filter
59 3dnow version of the Horizontal & Vertical Lowpass filters 72 3dnow version of the Horizontal & Vertical Lowpass filters
60 mmx version of the Horizontal Default filter 73 mmx version of the Horizontal Default filter
61 mmx2 & C versions of a simple filter described in a paper from ramkishor & karandikar 74 mmx2 & C versions of a simple filter described in a paper from ramkishor & karandikar
64 */ 77 */
65 78
66 79
67 #include <inttypes.h> 80 #include <inttypes.h>
68 #include <stdio.h> 81 #include <stdio.h>
82 #include <stdlib.h>
69 #include "../config.h" 83 #include "../config.h"
70 //#undef HAVE_MMX2 84 //#undef HAVE_MMX2
71 //#define HAVE_3DNOW 85 //#define HAVE_3DNOW
72 //#undef HAVE_MMX 86 //#undef HAVE_MMX
73 #include "postprocess.h" 87 #include "postprocess.h"
158 172
159 //FIXME? |255-0| = 1 (shouldnt be a problem ...) 173 //FIXME? |255-0| = 1 (shouldnt be a problem ...)
160 /** 174 /**
161 * Check if the middle 8x8 Block in the given 8x10 block is flat 175 * Check if the middle 8x8 Block in the given 8x10 block is flat
162 */ 176 */
163 static inline bool isVertDC(uint8_t src[], int stride){ 177 static inline int isVertDC(uint8_t src[], int stride){
164 // return true; 178 // return true;
165 int numEq= 0; 179 int numEq= 0;
180 int y;
166 src+= stride; // src points to begin of the 8x8 Block 181 src+= stride; // src points to begin of the 8x8 Block
167 #ifdef HAVE_MMX 182 #ifdef HAVE_MMX
168 asm volatile( 183 asm volatile(
169 // "int $3 \n\t" 184 // "int $3 \n\t"
170 "pushl %1\n\t" 185 "pushl %1\n\t"
240 // int asmEq= numEq; 255 // int asmEq= numEq;
241 // numEq=0; 256 // numEq=0;
242 // uint8_t *temp= src; 257 // uint8_t *temp= src;
243 258
244 #else 259 #else
245 for(int y=0; y<BLOCK_SIZE-1; y++) 260 for(y=0; y<BLOCK_SIZE-1; y++)
246 { 261 {
247 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++; 262 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
248 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++; 263 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
249 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++; 264 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
250 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++; 265 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
266 } 281 }
267 printf("\n"); 282 printf("\n");
268 } 283 }
269 } 284 }
270 */ 285 */
271 return numEq > vFlatnessThreshold; 286 // for(int i=0; i<numEq/8; i++) src[i]=255;
287 return (numEq > vFlatnessThreshold) ? 1 : 0;
272 } 288 }
273 289
274 static inline bool isVertMinMaxOk(uint8_t src[], int stride, int QP) 290 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
275 { 291 {
276 #ifdef HAVE_MMX 292 #ifdef HAVE_MMX
277 int isOk; 293 int isOk;
278 asm volatile( 294 asm volatile(
279 // "int $3 \n\t" 295 // "int $3 \n\t"
293 // "movd %%mm0, (%1, %2, 4)\n\t" 309 // "movd %%mm0, (%1, %2, 4)\n\t"
294 "movd %%mm0, %0 \n\t" 310 "movd %%mm0, %0 \n\t"
295 : "=r" (isOk) 311 : "=r" (isOk)
296 : "r" (src), "r" (stride) 312 : "r" (src), "r" (stride)
297 ); 313 );
298 return isOk; 314 return isOk ? 1 : 0;
299 #else 315 #else
300 316
301 int isOk2= true; 317 int isOk2= 1;
302 for(int x=0; x<BLOCK_SIZE; x++) 318 int x;
319 for(x=0; x<BLOCK_SIZE; x++)
303 { 320 {
304 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=false; 321 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
305 } 322 }
306 /* if(isOk && !isOk2 || !isOk && isOk2) 323 /* if(isOk && !isOk2 || !isOk && isOk2)
307 { 324 {
308 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP); 325 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
309 for(int y=0; y<9; y++) 326 for(int y=0; y<9; y++)
482 const int l5= stride + l4; 499 const int l5= stride + l4;
483 const int l6= stride + l5; 500 const int l6= stride + l5;
484 const int l7= stride + l6; 501 const int l7= stride + l6;
485 const int l8= stride + l7; 502 const int l8= stride + l7;
486 const int l9= stride + l8; 503 const int l9= stride + l8;
487 504 int x;
488 for(int x=0; x<BLOCK_SIZE; x++) 505 for(x=0; x<BLOCK_SIZE; x++)
489 { 506 {
490 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1]; 507 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
491 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8]; 508 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
492 509
493 int sums[9]; 510 int sums[9];
527 x/8 = 1 544 x/8 = 1
528 1 12 12 23 545 1 12 12 23
529 */ 546 */
530 static inline void vertRKFilter(uint8_t *src, int stride, int QP) 547 static inline void vertRKFilter(uint8_t *src, int stride, int QP)
531 { 548 {
532 #ifdef HAVE_MMX2 549 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
533 // FIXME rounding 550 // FIXME rounding
534 asm volatile( 551 asm volatile(
535 "pxor %%mm7, %%mm7 \n\t" // 0 552 "pxor %%mm7, %%mm7 \n\t" // 0
536 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE 553 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
537 "leal (%0, %1), %%eax \n\t" 554 "leal (%0, %1), %%eax \n\t"
547 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 564 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
548 "movq (%%ebx), %%mm3 \n\t" // line 5 565 "movq (%%ebx), %%mm3 \n\t" // line 5
549 "movq %%mm2, %%mm4 \n\t" // line 4 566 "movq %%mm2, %%mm4 \n\t" // line 4
550 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 567 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
551 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 568 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
552 "pavgb %%mm3, %%mm5 \n\t" 569 PAVGB(%%mm3, %%mm5)
553 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 570 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
554 "psubusb %%mm3, %%mm4 \n\t" 571 "psubusb %%mm3, %%mm4 \n\t"
555 "psubusb %%mm2, %%mm3 \n\t" 572 "psubusb %%mm2, %%mm3 \n\t"
556 "por %%mm3, %%mm4 \n\t" // |l4 - l5| 573 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
557 "psubusb %%mm0, %%mm4 \n\t" 574 "psubusb %%mm0, %%mm4 \n\t"
598 const int l5= stride + l4; 615 const int l5= stride + l4;
599 const int l6= stride + l5; 616 const int l6= stride + l5;
600 const int l7= stride + l6; 617 const int l7= stride + l6;
601 const int l8= stride + l7; 618 const int l8= stride + l7;
602 const int l9= stride + l8; 619 const int l9= stride + l8;
603 for(int x=0; x<BLOCK_SIZE; x++) 620 int x;
621 for(x=0; x<BLOCK_SIZE; x++)
604 { 622 {
605 if(ABS(src[l4]-src[l5]) < QP + QP/4) 623 if(ABS(src[l4]-src[l5]) < QP + QP/4)
606 { 624 {
607 int x = src[l5] - src[l4]; 625 int v = (src[l5] - src[l4]);
608 626
609 src[l3] +=x/8; 627 src[l3] +=v/8;
610 src[l4] +=x/2; 628 src[l4] +=v/2;
611 src[l5] -=x/2; 629 src[l5] -=v/2;
612 src[l6] -=x/8; 630 src[l6] -=v/8;
631
613 } 632 }
614 src++; 633 src++;
615 } 634 }
616 635
617 #endif 636 #endif
618 } 637 }
619 638
620 /** 639 /**
621 * Experimental Filter 1 640 * Experimental Filter 1
641 * will not damage linear gradients
642 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
643 * MMX2 version does correct clipping C version doesnt
622 */ 644 */
623 static inline void vertX1Filter(uint8_t *src, int stride, int QP) 645 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
624 { 646 {
625 #ifdef HAVE_MMX2X 647 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
626 // FIXME
627 asm volatile( 648 asm volatile(
649 "pxor %%mm7, %%mm7 \n\t" // 0
650 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
651 "leal (%0, %1), %%eax \n\t"
652 "leal (%%eax, %1, 4), %%ebx \n\t"
653 // 0 1 2 3 4 5 6 7 8 9
654 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
655 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
656 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
657 "movq %%mm1, %%mm2 \n\t" // line 4
658 "psubusb %%mm0, %%mm1 \n\t"
659 "psubusb %%mm2, %%mm0 \n\t"
660 "por %%mm1, %%mm0 \n\t" // |l2 - l3|
661 "movq (%%ebx), %%mm3 \n\t" // line 5
662 "movq (%%ebx, %1), %%mm4 \n\t" // line 6
663 "movq %%mm3, %%mm5 \n\t" // line 5
664 "psubusb %%mm4, %%mm3 \n\t"
665 "psubusb %%mm5, %%mm4 \n\t"
666 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
667 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
668 "movq %%mm2, %%mm1 \n\t" // line 4
669 "psubusb %%mm5, %%mm2 \n\t"
670 "movq %%mm2, %%mm4 \n\t"
671 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
672 "psubusb %%mm1, %%mm5 \n\t"
673 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
674 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
675 "movq %%mm4, %%mm3 \n\t" // d
676 "psubusb pQPb, %%mm4 \n\t"
677 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
678 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
679
680 PAVGB(%%mm7, %%mm3) // d/2
681
682 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
683 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
684 "psubusb %%mm3, %%mm0 \n\t"
685 "pxor %%mm2, %%mm0 \n\t"
686 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
687
688 "movq (%%ebx), %%mm0 \n\t" // line 5
689 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
690 "paddusb %%mm3, %%mm0 \n\t"
691 "pxor %%mm2, %%mm0 \n\t"
692 "movq %%mm0, (%%ebx) \n\t" // line 5
693
694 PAVGB(%%mm7, %%mm3) // d/4
695
696 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
697 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
698 "psubusb %%mm3, %%mm0 \n\t"
699 "pxor %%mm2, %%mm0 \n\t"
700 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
701
702 "movq (%%ebx, %1), %%mm0 \n\t" // line 6
703 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
704 "paddusb %%mm3, %%mm0 \n\t"
705 "pxor %%mm2, %%mm0 \n\t"
706 "movq %%mm0, (%%ebx, %1) \n\t" // line 6
707
708 PAVGB(%%mm7, %%mm3) // d/8
709
710 "movq (%%eax, %1), %%mm0 \n\t" // line 2
711 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
712 "psubusb %%mm3, %%mm0 \n\t"
713 "pxor %%mm2, %%mm0 \n\t"
714 "movq %%mm0, (%%eax, %1) \n\t" // line 2
715
716 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
717 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
718 "paddusb %%mm3, %%mm0 \n\t"
719 "pxor %%mm2, %%mm0 \n\t"
720 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
628 721
629 : 722 :
630 : "r" (src), "r" (stride) 723 : "r" (src), "r" (stride)
631 : "%eax", "%ebx" 724 : "%eax", "%ebx"
632 ); 725 );
633 #else 726 #else
727
728 const int l1= stride;
729 const int l2= stride + l1;
730 const int l3= stride + l2;
731 const int l4= stride + l3;
732 const int l5= stride + l4;
733 const int l6= stride + l5;
734 const int l7= stride + l6;
735 const int l8= stride + l7;
736 const int l9= stride + l8;
737 int x;
738 for(x=0; x<BLOCK_SIZE; x++)
739 {
740 int a= src[l3] - src[l4];
741 int b= src[l4] - src[l5];
742 int c= src[l6] - src[l7];
743
744 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
745
746 if(d < QP)
747 {
748 int v = d * SIGN(-b);
749
750 src[l2] +=v/8;
751 src[l3] +=v/4;
752 src[l4] +=v/2;
753 src[l5] -=v/2;
754 src[l6] -=v/4;
755 src[l7] -=v/8;
756
757 }
758 src++;
759 }
760 /*
634 const int l1= stride; 761 const int l1= stride;
635 const int l2= stride + l1; 762 const int l2= stride + l1;
636 const int l3= stride + l2; 763 const int l3= stride + l2;
637 const int l4= stride + l3; 764 const int l4= stride + l3;
638 const int l5= stride + l4; 765 const int l5= stride + l4;
656 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16; 783 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
657 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16; 784 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
658 } 785 }
659 src++; 786 src++;
660 } 787 }
661 788 */
662 #endif 789 #endif
663 } 790 }
664 791
665 792
666 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) 793 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
906 const int l5= stride + l4; 1033 const int l5= stride + l4;
907 const int l6= stride + l5; 1034 const int l6= stride + l5;
908 const int l7= stride + l6; 1035 const int l7= stride + l6;
909 const int l8= stride + l7; 1036 const int l8= stride + l7;
910 // const int l9= stride + l8; 1037 // const int l9= stride + l8;
911 1038 int x;
912 for(int x=0; x<BLOCK_SIZE; x++) 1039 for(x=0; x<BLOCK_SIZE; x++)
913 { 1040 {
914 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); 1041 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
915 if(ABS(middleEnergy) < 8*QP) 1042 if(ABS(middleEnergy) < 8*QP)
916 { 1043 {
917 const int q=(src[l4] - src[l5])/2; 1044 const int q=(src[l4] - src[l5])/2;
945 1072
946 //FIXME? |255-0| = 1 1073 //FIXME? |255-0| = 1
947 /** 1074 /**
948 * Check if the given 8x8 Block is mostly "flat" and copy the unaliged data into tempBlock. 1075 * Check if the given 8x8 Block is mostly "flat" and copy the unaliged data into tempBlock.
949 */ 1076 */
950 static inline bool isHorizDCAndCopy2Temp(uint8_t src[], int stride) 1077 static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride)
951 { 1078 {
952 // src++; 1079 // src++;
953 int numEq= 0; 1080 int numEq= 0;
954 #ifdef HAVE_MMX 1081 #ifdef HAVE_MMX
955 asm volatile ( 1082 asm volatile (
1005 : "%eax" 1132 : "%eax"
1006 ); 1133 );
1007 // printf("%d\n", numEq); 1134 // printf("%d\n", numEq);
1008 numEq= (256 - (numEq & 0xFF)) &0xFF; 1135 numEq= (256 - (numEq & 0xFF)) &0xFF;
1009 #else 1136 #else
1010 for(int y=0; y<BLOCK_SIZE; y++) 1137 int y;
1138 for(y=0; y<BLOCK_SIZE; y++)
1011 { 1139 {
1012 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++; 1140 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1013 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++; 1141 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1014 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++; 1142 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1015 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++; 1143 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1042 */ 1170 */
1043 // printf("%d\n", numEq); 1171 // printf("%d\n", numEq);
1044 return numEq > hFlatnessThreshold; 1172 return numEq > hFlatnessThreshold;
1045 } 1173 }
1046 1174
1047 static inline bool isHorizMinMaxOk(uint8_t src[], int stride, int QP) 1175 static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
1048 { 1176 {
1049 #ifdef MMX_FIXME 1177 #ifdef MMX_FIXME
1050 FIXME 1178 FIXME
1051 int isOk; 1179 int isOk;
1052 asm volatile( 1180 asm volatile(
1069 : "=r" (isOk) 1197 : "=r" (isOk)
1070 : "r" (src), "r" (stride) 1198 : "r" (src), "r" (stride)
1071 ); 1199 );
1072 return isOk; 1200 return isOk;
1073 #else 1201 #else
1074 if(abs(src[0] - src[7]) > 2*QP) return false; 1202 if(abs(src[0] - src[7]) > 2*QP) return 0;
1075 1203
1076 return true; 1204 return 1;
1077 #endif 1205 #endif
1078 } 1206 }
1079 1207
1080 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP) 1208 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP)
1081 { 1209 {
1171 : "%eax" 1299 : "%eax"
1172 ); 1300 );
1173 #else 1301 #else
1174 uint8_t *src= tempBlock; 1302 uint8_t *src= tempBlock;
1175 1303
1176 for(int y=0; y<BLOCK_SIZE; y++) 1304 int y;
1305 for(y=0; y<BLOCK_SIZE; y++)
1177 { 1306 {
1178 dst[0] = src[0]; 1307 dst[0] = src[0];
1179 dst[1] = src[1]; 1308 dst[1] = src[1];
1180 dst[2] = src[2]; 1309 dst[2] = src[2];
1181 dst[3] = src[3]; 1310 dst[3] = src[3];
1373 : "%eax", "%ebx" 1502 : "%eax", "%ebx"
1374 ); 1503 );
1375 1504
1376 #else 1505 #else
1377 uint8_t *temp= tempBlock; 1506 uint8_t *temp= tempBlock;
1378 for(int y=0; y<BLOCK_SIZE; y++) 1507 int y;
1508 for(y=0; y<BLOCK_SIZE; y++)
1379 { 1509 {
1380 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0]; 1510 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1381 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7]; 1511 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
1382 1512
1383 int sums[9]; 1513 int sums[9];
1500 while( (rdtsc() - T)/1000 < 4000); 1630 while( (rdtsc() - T)/1000 < 4000);
1501 1631
1502 return; 1632 return;
1503 */ 1633 */
1504 postProcess(src[0], src_stride, dst[0], dst_stride, 1634 postProcess(src[0], src_stride, dst[0], dst_stride,
1505 horizontal_size, vertical_size, QP_store, QP_stride, false, mode); 1635 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
1506 1636
1507 horizontal_size >>= 1; 1637 horizontal_size >>= 1;
1508 vertical_size >>= 1; 1638 vertical_size >>= 1;
1509 src_stride >>= 1; 1639 src_stride >>= 1;
1510 dst_stride >>= 1; 1640 dst_stride >>= 1;
1511 1641
1512 if(1) 1642 if(1)
1513 { 1643 {
1514 postProcess(src[1], src_stride, dst[1], dst_stride, 1644 postProcess(src[1], src_stride, dst[1], dst_stride,
1515 horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4); 1645 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4);
1516 postProcess(src[2], src_stride, dst[2], dst_stride, 1646 postProcess(src[2], src_stride, dst[2], dst_stride,
1517 horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4); 1647 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4);
1518 } 1648 }
1519 else 1649 else
1520 { 1650 {
1521 memcpy(dst[1], src[1], src_stride*horizontal_size); 1651 memcpy(dst[1], src[1], src_stride*horizontal_size);
1522 memcpy(dst[2], src[2], src_stride*horizontal_size); 1652 memcpy(dst[2], src[2], src_stride*horizontal_size);
1541 1671
1542 } // extern "C" 1672 } // extern "C"
1543 1673
1544 /** 1674 /**
1545 * Copies a block from src to dst and fixes the blacklevel 1675 * Copies a block from src to dst and fixes the blacklevel
1676 * numLines must be a multiple of 4
1677 * levelFix == 0 -> dont touch the brighness & contrast
1546 */ 1678 */
1547 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride) 1679 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
1680 int numLines, int levelFix)
1548 { 1681 {
1682 int i;
1683 if(levelFix)
1684 {
1549 #ifdef HAVE_MMX 1685 #ifdef HAVE_MMX
1550 asm volatile( 1686 asm volatile(
1687 "movl %4, %%eax \n\t"
1688 "movl %%eax, temp0\n\t"
1551 "pushl %0 \n\t" 1689 "pushl %0 \n\t"
1552 "pushl %1 \n\t" 1690 "pushl %1 \n\t"
1553 "leal (%2,%2), %%eax \n\t" 1691 "leal (%2,%2), %%eax \n\t"
1554 "leal (%3,%3), %%ebx \n\t" 1692 "leal (%3,%3), %%ebx \n\t"
1555 "movq packedYOffset, %%mm2 \n\t" 1693 "movq packedYOffset, %%mm2 \n\t"
1556 "movq packedYScale, %%mm3 \n\t" 1694 "movq packedYScale, %%mm3 \n\t"
1557
1558 #define SIMPLE_CPY \
1559 "movq (%0), %%mm0 \n\t"\
1560 "movq (%0,%2), %%mm1 \n\t"\
1561 "psubusb %%mm2, %%mm0 \n\t"\
1562 "psubusb %%mm2, %%mm1 \n\t"\
1563 "movq %%mm0, (%1) \n\t"\
1564 "movq %%mm1, (%1, %3) \n\t"\
1565 1695
1566 #define SCALED_CPY \ 1696 #define SCALED_CPY \
1567 "movq (%0), %%mm0 \n\t"\ 1697 "movq (%0), %%mm0 \n\t"\
1568 "movq (%0,%2), %%mm1 \n\t"\ 1698 "movq (%0,%2), %%mm1 \n\t"\
1569 "psubusb %%mm2, %%mm0 \n\t"\ 1699 "psubusb %%mm2, %%mm0 \n\t"\
1583 "pmulhuw %%mm3, %%mm4 \n\t"\ 1713 "pmulhuw %%mm3, %%mm4 \n\t"\
1584 "pmulhuw %%mm3, %%mm5 \n\t"\ 1714 "pmulhuw %%mm3, %%mm5 \n\t"\
1585 "packuswb %%mm5, %%mm4 \n\t"\ 1715 "packuswb %%mm5, %%mm4 \n\t"\
1586 "movq %%mm4, (%1, %3) \n\t"\ 1716 "movq %%mm4, (%1, %3) \n\t"\
1587 1717
1588 1718 "1: \n\t"
1589 #define CPY SCALED_CPY 1719 SCALED_CPY
1590 //#define CPY SIMPLE_CPY
1591 // "prefetchnta 8(%0)\n\t"
1592 CPY
1593 "addl %%eax, %0 \n\t" 1720 "addl %%eax, %0 \n\t"
1594 "addl %%ebx, %1 \n\t" 1721 "addl %%ebx, %1 \n\t"
1595 CPY 1722 SCALED_CPY
1596 "addl %%eax, %0 \n\t" 1723 "addl %%eax, %0 \n\t"
1597 "addl %%ebx, %1 \n\t" 1724 "addl %%ebx, %1 \n\t"
1598 CPY 1725 "decl temp0 \n\t"
1599 "addl %%eax, %0 \n\t" 1726 "jnz 1b \n\t"
1600 "addl %%ebx, %1 \n\t" 1727
1601 CPY
1602 "popl %1 \n\t" 1728 "popl %1 \n\t"
1603 "popl %0 \n\t" 1729 "popl %0 \n\t"
1604 : : "r" (src), 1730 : : "r" (src),
1605 "r" (dst), 1731 "r" (dst),
1606 "r" (srcStride), 1732 "r" (srcStride),
1607 "r" (dstStride) 1733 "r" (dstStride),
1734 "m" (numLines>>2)
1608 : "%eax", "%ebx" 1735 : "%eax", "%ebx"
1609 ); 1736 );
1610 #else 1737 #else
1611 for(int i=0; i<BLOCK_SIZE; i++) // last 10x8 Block is copied allready so +2 1738 for(i=0; i<numLines; i++)
1612 memcpy( &(dst[dstStride*i]), 1739 memcpy( &(dst[dstStride*i]),
1613 &(src[srcStride*i]), BLOCK_SIZE); 1740 &(src[srcStride*i]), BLOCK_SIZE);
1614 #endif 1741 #endif
1742 }
1743 else
1744 {
1745 #ifdef HAVE_MMX
1746 asm volatile(
1747 "movl %4, %%eax \n\t"
1748 "movl %%eax, temp0\n\t"
1749 "pushl %0 \n\t"
1750 "pushl %1 \n\t"
1751 "leal (%2,%2), %%eax \n\t"
1752 "leal (%3,%3), %%ebx \n\t"
1753 "movq packedYOffset, %%mm2 \n\t"
1754 "movq packedYScale, %%mm3 \n\t"
1755
1756 #define SIMPLE_CPY \
1757 "movq (%0), %%mm0 \n\t"\
1758 "movq (%0,%2), %%mm1 \n\t"\
1759 "movq %%mm0, (%1) \n\t"\
1760 "movq %%mm1, (%1, %3) \n\t"\
1761
1762 "1: \n\t"
1763 SIMPLE_CPY
1764 "addl %%eax, %0 \n\t"
1765 "addl %%ebx, %1 \n\t"
1766 SIMPLE_CPY
1767 "addl %%eax, %0 \n\t"
1768 "addl %%ebx, %1 \n\t"
1769 "decl temp0 \n\t"
1770 "jnz 1b \n\t"
1771
1772 "popl %1 \n\t"
1773 "popl %0 \n\t"
1774 : : "r" (src),
1775 "r" (dst),
1776 "r" (srcStride),
1777 "r" (dstStride),
1778 "m" (numLines>>2)
1779 : "%eax", "%ebx"
1780 );
1781 #else
1782 for(i=0; i<numLines; i++)
1783 memcpy( &(dst[dstStride*i]),
1784 &(src[srcStride*i]), BLOCK_SIZE);
1785 #endif
1786 }
1615 } 1787 }
1616 1788
1617 1789
1618 /** 1790 /**
1619 * Filters array of bytes (Y or U or V values) 1791 * Filters array of bytes (Y or U or V values)
1620 */ 1792 */
1621 void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 1793 void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
1622 QP_STORE_T QPs[], int QPStride, bool isColor, int mode) 1794 QP_STORE_T QPs[], int QPStride, int isColor, int mode)
1623 { 1795 {
1796 int x,y;
1797 /* we need 64bit here otherwise weŽll going to have a problem
1798 after watching a black picture for 5 hours*/
1799 static uint64_t *yHistogram= NULL;
1800 int black=0, white=255; // blackest black and whitest white in the picture
1624 1801
1625 #ifdef TIMEING 1802 #ifdef TIMEING
1626 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0; 1803 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
1627 sumTime= rdtsc(); 1804 sumTime= rdtsc();
1628 #endif 1805 #endif
1629 1806
1630 /* we need 64bit here otherwise weŽll going to have a problem
1631 after watching a black picture for 5 hours*/
1632 static uint64_t *yHistogram= NULL;
1633 if(!yHistogram) 1807 if(!yHistogram)
1634 { 1808 {
1635 yHistogram= new uint64_t[256]; 1809 int i;
1636 for(int i=0; i<256; i++) yHistogram[i]= width*height/64/256; 1810 yHistogram= (uint64_t*)malloc(8*256);
1811 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
1637 } 1812 }
1638 1813
1639 int black=0, white=255; // blackest black and whitest white in the picture
1640 if(!isColor) 1814 if(!isColor)
1641 { 1815 {
1642 uint64_t sum= 0; 1816 uint64_t sum= 0;
1643 for(int i=0; i<256; i++) 1817 int i;
1818 static int framenum= -1;
1819 uint64_t maxClipped;
1820 uint64_t clipped;
1821 double scale;
1822
1823 framenum++;
1824 if(framenum == 1) yHistogram[0]= width*height/64*15/256;
1825
1826 for(i=0; i<256; i++)
1827 {
1644 sum+= yHistogram[i]; 1828 sum+= yHistogram[i];
1645 1829 // printf("%d ", yHistogram[i]);
1646 uint64_t maxClipped= (uint64_t)(sum * maxClippedThreshold); 1830 }
1647 1831 // printf("\n\n");
1648 uint64_t clipped= sum; 1832
1833 /* we allways get a completly black picture first */
1834
1835 maxClipped= (uint64_t)(sum * maxClippedThreshold);
1836
1837 clipped= sum;
1649 for(black=255; black>0; black--) 1838 for(black=255; black>0; black--)
1650 { 1839 {
1651 if(clipped < maxClipped) break; 1840 if(clipped < maxClipped) break;
1652 clipped-= yHistogram[black]; 1841 clipped-= yHistogram[black];
1653 } 1842 }
1663 packedYOffset= MAX(black - minAllowedY, 0); 1852 packedYOffset= MAX(black - minAllowedY, 0);
1664 packedYOffset|= packedYOffset<<32; 1853 packedYOffset|= packedYOffset<<32;
1665 packedYOffset|= packedYOffset<<16; 1854 packedYOffset|= packedYOffset<<16;
1666 packedYOffset|= packedYOffset<<8; 1855 packedYOffset|= packedYOffset<<8;
1667 1856
1668 double scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black); 1857 scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
1669 1858
1670 packedYScale= uint16_t(scale*256.0 + 0.5); 1859 packedYScale= (uint16_t)(scale*256.0 + 0.5);
1671 packedYScale|= packedYScale<<32; 1860 packedYScale|= packedYScale<<32;
1672 packedYScale|= packedYScale<<16; 1861 packedYScale|= packedYScale<<16;
1673 } 1862 }
1674 else 1863 else
1675 { 1864 {
1676 packedYScale= 0x0100010001000100LL; 1865 packedYScale= 0x0100010001000100LL;
1677 packedYOffset= 0; 1866 packedYOffset= 0;
1678 } 1867 }
1679 1868
1680 for(int x=0; x<width; x+=BLOCK_SIZE) 1869 for(x=0; x<width; x+=BLOCK_SIZE)
1681 blockCopy(dst + x, dstStride, src + x, srcStride); 1870 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
1682 1871
1683 for(int y=0; y<height; y+=BLOCK_SIZE) 1872 for(y=0; y<height; y+=BLOCK_SIZE)
1684 { 1873 {
1685 //1% speedup if these are here instead of the inner loop 1874 //1% speedup if these are here instead of the inner loop
1686 uint8_t *srcBlock= &(src[y*srcStride]); 1875 uint8_t *srcBlock= &(src[y*srcStride]);
1687 uint8_t *dstBlock= &(dst[y*dstStride]); 1876 uint8_t *dstBlock= &(dst[y*dstStride]);
1688 uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start 1877 uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start
1689 uint8_t *vertBlock= &(dstBlock[dstStride*3]); 1878 uint8_t *vertBlock= &(dstBlock[dstStride*3]);
1690 1879
1691 // finish 1 block before the next otherwise weŽll might have a problem 1880 // finish 1 block before the next otherwise weŽll might have a problem
1692 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing 1881 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
1693 for(int x=0; x<width; x+=BLOCK_SIZE) 1882 for(x=0; x<width; x+=BLOCK_SIZE)
1694 { 1883 {
1884 const int stride= dstStride;
1695 int QP= isColor ? 1885 int QP= isColor ?
1696 QPs[(y>>3)*QPStride + (x>>3)]: 1886 QPs[(y>>3)*QPStride + (x>>3)]:
1697 (QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8; 1887 (QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8;
1698 #ifdef HAVE_MMX 1888 #ifdef HAVE_MMX
1699 asm volatile( 1889 asm volatile(
1705 : : "r" (QP) 1895 : : "r" (QP)
1706 ); 1896 );
1707 #endif 1897 #endif
1708 1898
1709 1899
1710 const int stride= dstStride;
1711 if(y + 12 < height) 1900 if(y + 12 < height)
1712 { 1901 {
1713 #ifdef MORE_TIMEING 1902 #ifdef MORE_TIMEING
1714 T0= rdtsc(); 1903 T0= rdtsc();
1715 #endif 1904 #endif
1728 */ 1917 */
1729 #endif 1918 #endif
1730 if(!isColor) yHistogram[ srcBlock[0] ]++; 1919 if(!isColor) yHistogram[ srcBlock[0] ]++;
1731 1920
1732 blockCopy(vertBlock + dstStride*2, dstStride, 1921 blockCopy(vertBlock + dstStride*2, dstStride,
1733 vertSrcBlock + srcStride*2, srcStride); 1922 vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);
1734 1923
1735 1924
1736 #ifdef MORE_TIMEING 1925 #ifdef MORE_TIMEING
1737 T1= rdtsc(); 1926 T1= rdtsc();
1738 memcpyTime+= T1-T0; 1927 memcpyTime+= T1-T0;
1740 #endif 1929 #endif
1741 if(mode & V_DEBLOCK) 1930 if(mode & V_DEBLOCK)
1742 { 1931 {
1743 if(mode & RK_FILTER) 1932 if(mode & RK_FILTER)
1744 vertRKFilter(vertBlock, stride, QP); 1933 vertRKFilter(vertBlock, stride, QP);
1745 else if(0) 1934 else if(mode & X1_FILTER)
1746 vertX1Filter(vertBlock, stride, QP); 1935 vertX1Filter(vertBlock, stride, QP);
1747 else 1936 else
1748 { 1937 {
1749 if( isVertDC(vertBlock, stride)) 1938 if( isVertDC(vertBlock, stride))
1750 { 1939 {
1760 vertTime+= T1-T0; 1949 vertTime+= T1-T0;
1761 T0=T1; 1950 T0=T1;
1762 #endif 1951 #endif
1763 } 1952 }
1764 else 1953 else
1765 { 1954 blockCopy(vertBlock + dstStride*1, dstStride,
1766 for(int i=2; i<BLOCK_SIZE/2+1; i++) // last 10x8 Block is copied allready so +2 1955 vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
1767 memcpy( &(vertBlock[dstStride*i]), 1956
1768 &(vertSrcBlock[srcStride*i]), BLOCK_SIZE);
1769
1770 }
1771 1957
1772 if(x - 8 >= 0 && x<width) 1958 if(x - 8 >= 0 && x<width)
1773 { 1959 {
1774 #ifdef MORE_TIMEING 1960 #ifdef MORE_TIMEING
1775 T0= rdtsc(); 1961 T0= rdtsc();
1811 #ifdef TIMEING 1997 #ifdef TIMEING
1812 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...) 1998 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
1813 sumTime= rdtsc() - sumTime; 1999 sumTime= rdtsc() - sumTime;
1814 if(!isColor) 2000 if(!isColor)
1815 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r", 2001 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
1816 int(memcpyTime/1000), int(vertTime/1000), int(horizTime/1000), 2002 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
1817 int(sumTime/1000), int((sumTime-memcpyTime-vertTime-horizTime)/1000) 2003 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
1818 , black, white); 2004 , black, white);
1819 #endif 2005 #endif
1820 } 2006 }
1821 2007
1822 2008