comparison libpostproc/postprocess_template.c @ 99:4f072fa99ccf libavcodec

fixed a rounding bug thing in the X1 Filter; changed the X1 Filter slightly to make flat blocks look like in the 9tap lpf; minor change to the -pp numbers & added decimal numbers in comments; new experimental horizontal deblocking filter
author michael
date Sat, 13 Oct 2001 02:31:15 +0000
parents eaae16507d9b
children 1d1182345591
comparison
equal deleted inserted replaced
98:eaae16507d9b 99:4f072fa99ccf
25 isHorizDC Ec Ec 25 isHorizDC Ec Ec
26 isHorizMinMaxOk a 26 isHorizMinMaxOk a
27 doHorizLowPass E a a* 27 doHorizLowPass E a a*
28 doHorizDefFilter E ac ac 28 doHorizDefFilter E ac ac
29 deRing 29 deRing
30 RKAlgo1 E a a* 30 Vertical RKAlgo1 E a a*
31 X1 a E E* 31 Vertical X1 a E E*
32 Horizontal X1 a E E*
32 33
33 34
34 * i dont have a 3dnow CPU -> its untested 35 * i dont have a 3dnow CPU -> its untested
35 E = Exact implementation 36 E = Exact implementation
36 e = allmost exact implementation 37 e = allmost exact implementation
38 c = checked against the other implementations (-vo md5) 39 c = checked against the other implementations (-vo md5)
39 */ 40 */
40 41
41 /* 42 /*
42 TODO: 43 TODO:
43 verify that everything workes as it should 44 verify that everything workes as it should (how?)
44 reduce the time wasted on the mem transfer 45 reduce the time wasted on the mem transfer
45 implement dering 46 implement dering
46 implement everything in C at least (done at the moment but ...) 47 implement everything in C at least (done at the moment but ...)
47 unroll stuff if instructions depend too much on the prior one 48 unroll stuff if instructions depend too much on the prior one
48 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? 49 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
49 move YScale thing to the end instead of fixing QP 50 move YScale thing to the end instead of fixing QP
50 write a faster and higher quality deblocking filter :) 51 write a faster and higher quality deblocking filter :)
51 do something about the speed of the horizontal filters 52 do something about the speed of the horizontal filters
52 make the mainloop more flexible (variable number of blocks at once 53 make the mainloop more flexible (variable number of blocks at once
53 (the if/else stuff per block is slowing things down) 54 (the if/else stuff per block is slowing things down)
55 compare the quality & speed of all filters
56 implement a few simple deinterlacing filters
57 split this huge file
54 ... 58 ...
55 59
56 Notes: 60 Notes:
57 61
58 */ 62 */
59 63
60 /* 64 /*
61 Changelog: 65 Changelog: use the CVS log
62 0.1.3 66 0.1.3
63 bugfixes: last 3 lines not brightness/contrast corrected 67 bugfixes: last 3 lines not brightness/contrast corrected
64 brightness statistics messed up with initial black pic 68 brightness statistics messed up with initial black pic
65 changed initial values of the brightness statistics 69 changed initial values of the brightness statistics
66 C++ -> C conversation 70 C++ -> C conversation
97 static uint64_t bm00001000= 0x00000000FF000000LL; 101 static uint64_t bm00001000= 0x00000000FF000000LL;
98 static uint64_t bm10000000= 0xFF00000000000000LL; 102 static uint64_t bm10000000= 0xFF00000000000000LL;
99 static uint64_t bm10000001= 0xFF000000000000FFLL; 103 static uint64_t bm10000001= 0xFF000000000000FFLL;
100 static uint64_t bm11000011= 0xFFFF00000000FFFFLL; 104 static uint64_t bm11000011= 0xFFFF00000000FFFFLL;
101 static uint64_t bm00000011= 0x000000000000FFFFLL; 105 static uint64_t bm00000011= 0x000000000000FFFFLL;
106 static uint64_t bm11111110= 0xFFFFFFFFFFFFFF00LL;
102 static uint64_t bm11000000= 0xFFFF000000000000LL; 107 static uint64_t bm11000000= 0xFFFF000000000000LL;
103 static uint64_t bm00011000= 0x000000FFFF000000LL; 108 static uint64_t bm00011000= 0x000000FFFF000000LL;
104 static uint64_t bm00110011= 0x0000FFFF0000FFFFLL; 109 static uint64_t bm00110011= 0x0000FFFF0000FFFFLL;
105 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL; 110 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL;
106 static uint64_t b00= 0x0000000000000000LL; 111 static uint64_t b00= 0x0000000000000000LL;
112 static uint64_t b01= 0x0101010101010101LL;
107 static uint64_t b02= 0x0202020202020202LL; 113 static uint64_t b02= 0x0202020202020202LL;
108 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL; 114 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL;
109 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL; 115 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL;
110 static uint64_t b20= 0x2020202020202020LL; 116 static uint64_t b20= 0x2020202020202020LL;
111 static uint64_t b80= 0x8080808080808080LL; 117 static uint64_t b80= 0x8080808080808080LL;
542 x = 8 548 x = 8
543 x/2 = 4 549 x/2 = 4
544 x/8 = 1 550 x/8 = 1
545 1 12 12 23 551 1 12 12 23
546 */ 552 */
547 static inline void vertRKFilter(uint8_t *src, int stride, int QP) 553 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
548 { 554 {
549 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 555 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
550 // FIXME rounding 556 // FIXME rounding
551 asm volatile( 557 asm volatile(
552 "pxor %%mm7, %%mm7 \n\t" // 0 558 "pxor %%mm7, %%mm7 \n\t" // 0
636 #endif 642 #endif
637 } 643 }
638 644
639 /** 645 /**
640 * Experimental Filter 1 646 * Experimental Filter 1
641 * will nor damage linear gradients 647 * will not damage linear gradients
648 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
642 * can only smooth blocks at the expected locations (it cant smooth them if they did move) 649 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
643 * MMX2 version does correct clipping C version doesnt 650 * MMX2 version does correct clipping C version doesnt
644 */ 651 */
645 static inline void vertX1Filter(uint8_t *src, int stride, int QP) 652 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
646 { 653 {
673 "por %%mm5, %%mm4 \n\t" // |l4 - l5| 680 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
674 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) 681 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
675 "movq %%mm4, %%mm3 \n\t" // d 682 "movq %%mm4, %%mm3 \n\t" // d
676 "psubusb pQPb, %%mm4 \n\t" 683 "psubusb pQPb, %%mm4 \n\t"
677 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 684 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
685 "psubusb b01, %%mm3 \n\t"
678 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 686 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
679 687
680 PAVGB(%%mm7, %%mm3) // d/2 688 PAVGB(%%mm7, %%mm3) // d/2
689 "movq %%mm3, %%mm1 \n\t" // d/2
690 PAVGB(%%mm7, %%mm3) // d/4
691 PAVGB(%%mm1, %%mm3) // 3*d/8
681 692
682 "movq (%0, %1, 4), %%mm0 \n\t" // line 4 693 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
683 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 694 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
684 "psubusb %%mm3, %%mm0 \n\t" 695 "psubusb %%mm3, %%mm0 \n\t"
685 "pxor %%mm2, %%mm0 \n\t" 696 "pxor %%mm2, %%mm0 \n\t"
689 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 700 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
690 "paddusb %%mm3, %%mm0 \n\t" 701 "paddusb %%mm3, %%mm0 \n\t"
691 "pxor %%mm2, %%mm0 \n\t" 702 "pxor %%mm2, %%mm0 \n\t"
692 "movq %%mm0, (%%ebx) \n\t" // line 5 703 "movq %%mm0, (%%ebx) \n\t" // line 5
693 704
694 PAVGB(%%mm7, %%mm3) // d/4 705 PAVGB(%%mm7, %%mm1) // d/4
695 706
696 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 707 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
697 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 708 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
698 "psubusb %%mm3, %%mm0 \n\t" 709 "psubusb %%mm1, %%mm0 \n\t"
699 "pxor %%mm2, %%mm0 \n\t" 710 "pxor %%mm2, %%mm0 \n\t"
700 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 711 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
701 712
702 "movq (%%ebx, %1), %%mm0 \n\t" // line 6 713 "movq (%%ebx, %1), %%mm0 \n\t" // line 6
703 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 714 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
704 "paddusb %%mm3, %%mm0 \n\t" 715 "paddusb %%mm1, %%mm0 \n\t"
705 "pxor %%mm2, %%mm0 \n\t" 716 "pxor %%mm2, %%mm0 \n\t"
706 "movq %%mm0, (%%ebx, %1) \n\t" // line 6 717 "movq %%mm0, (%%ebx, %1) \n\t" // line 6
707 718
708 PAVGB(%%mm7, %%mm3) // d/8 719 PAVGB(%%mm7, %%mm1) // d/8
709 720
710 "movq (%%eax, %1), %%mm0 \n\t" // line 2 721 "movq (%%eax, %1), %%mm0 \n\t" // line 2
711 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 722 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
712 "psubusb %%mm3, %%mm0 \n\t" 723 "psubusb %%mm1, %%mm0 \n\t"
713 "pxor %%mm2, %%mm0 \n\t" 724 "pxor %%mm2, %%mm0 \n\t"
714 "movq %%mm0, (%%eax, %1) \n\t" // line 2 725 "movq %%mm0, (%%eax, %1) \n\t" // line 2
715 726
716 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7 727 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
717 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 728 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
718 "paddusb %%mm3, %%mm0 \n\t" 729 "paddusb %%mm1, %%mm0 \n\t"
719 "pxor %%mm2, %%mm0 \n\t" 730 "pxor %%mm2, %%mm0 \n\t"
720 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7 731 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
721 732
722 : 733 :
723 : "r" (src), "r" (stride) 734 : "r" (src), "r" (stride)
737 int x; 748 int x;
738 for(x=0; x<BLOCK_SIZE; x++) 749 for(x=0; x<BLOCK_SIZE; x++)
739 { 750 {
740 int a= src[l3] - src[l4]; 751 int a= src[l3] - src[l4];
741 int b= src[l4] - src[l5]; 752 int b= src[l4] - src[l5];
742 int c= src[l6] - src[l7]; 753 int c= src[l5] - src[l6];
743 754
744 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); 755 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
745 756
746 if(d < QP) 757 if(d < QP)
747 { 758 {
748 int v = d * SIGN(-b); 759 int v = d * SIGN(-b);
749 760
750 src[l2] +=v/8; 761 src[l2] +=v/8;
751 src[l3] +=v/4; 762 src[l3] +=v/4;
752 src[l4] +=v/2; 763 src[l4] +=3*v/8;
753 src[l5] -=v/2; 764 src[l5] -=3*v/8;
754 src[l6] -=v/4; 765 src[l6] -=v/4;
755 src[l7] -=v/8; 766 src[l7] -=v/8;
756 767
757 } 768 }
758 src++; 769 src++;
784 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16; 795 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
785 } 796 }
786 src++; 797 src++;
787 } 798 }
788 */ 799 */
800 #endif
801 }
802
803 /**
804 * Experimental Filter 1 (Horizontal)
805 * will not damage linear gradients
806 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
807 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
808 * MMX2 version does correct clipping C version doesnt
809 * not identical with the vertical one
810 */
811 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
812 {
813 int y;
814 static uint64_t *lut= NULL;
815 if(lut==NULL)
816 {
817 int i;
818 lut= (uint64_t*)memalign(8, 256*8);
819 for(i=0; i<256; i++)
820 {
821 int v= i < 128 ? 2*i : 2*(i-256);
822 /*
823 //Simulate 112242211 9-Tap filter
824 uint64_t a= (v/16) & 0xFF;
825 uint64_t b= (v/8) & 0xFF;
826 uint64_t c= (v/4) & 0xFF;
827 uint64_t d= (3*v/8) & 0xFF;
828 */
829 //Simulate piecewise linear interpolation
830 uint64_t a= (v/16) & 0xFF;
831 uint64_t b= (v*3/16) & 0xFF;
832 uint64_t c= (v*5/16) & 0xFF;
833 uint64_t d= (7*v/16) & 0xFF;
834 uint64_t A= (0x100 - a)&0xFF;
835 uint64_t B= (0x100 - b)&0xFF;
836 uint64_t C= (0x100 - c)&0xFF;
837 uint64_t D= (0x100 - c)&0xFF;
838
839 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
840 (D<<24) | (C<<16) | (B<<8) | (A);
841 //lut[i] = (v<<32) | (v<<24);
842 }
843 }
844
845 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
846 asm volatile(
847 "pxor %%mm7, %%mm7 \n\t" // 0
848 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
849 "leal (%0, %1), %%eax \n\t"
850 "leal (%%eax, %1, 4), %%ebx \n\t"
851
852 "movq b80, %%mm6 \n\t"
853 "movd %2, %%mm5 \n\t" // QP
854 "movq %%mm5, %%mm4 \n\t"
855 "paddusb %%mm5, %%mm5 \n\t" // 2QP
856 "paddusb %%mm5, %%mm4 \n\t" // 3QP
857 "pxor %%mm5, %%mm5 \n\t" // 0
858 "psubb %%mm4, %%mm5 \n\t" // -3QP
859 "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP
860 "psllq $24, %%mm5 \n\t"
861
862 // 0 1 2 3 4 5 6 7 8 9
863 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
864
865 #define HX1old(a) \
866 "movd " #a ", %%mm0 \n\t"\
867 "movd 4" #a ", %%mm1 \n\t"\
868 "punpckldq %%mm1, %%mm0 \n\t"\
869 "movq %%mm0, %%mm1 \n\t"\
870 "movq %%mm0, %%mm2 \n\t"\
871 "psrlq $8, %%mm1 \n\t"\
872 "psubusb %%mm1, %%mm2 \n\t"\
873 "psubusb %%mm0, %%mm1 \n\t"\
874 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
875 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
876 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
877 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
878 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
879 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
880 "paddb %%mm5, %%mm1 \n\t"\
881 "psubusb %%mm5, %%mm1 \n\t"\
882 PAVGB(%%mm7, %%mm1)\
883 "pxor %%mm2, %%mm1 \n\t"\
884 "psubb %%mm2, %%mm1 \n\t"\
885 "psrlq $24, %%mm1 \n\t"\
886 "movd %%mm1, %%ecx \n\t"\
887 "paddb %%mm6, %%mm0 \n\t"\
888 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
889 "paddb %%mm6, %%mm0 \n\t"\
890 "movq %%mm0, " #a " \n\t"\
891
892 /*
893 HX1old((%0))
894 HX1old((%%eax))
895 HX1old((%%eax, %1))
896 HX1old((%%eax, %1, 2))
897 HX1old((%0, %1, 4))
898 HX1old((%%ebx))
899 HX1old((%%ebx, %1))
900 HX1old((%%ebx, %1, 2))
901 */
902
903 //FIXME add some comments, its unreadable ...
904 #define HX1b(a, c, b, d) \
905 "movd " #a ", %%mm0 \n\t"\
906 "movd 4" #a ", %%mm1 \n\t"\
907 "punpckldq %%mm1, %%mm0 \n\t"\
908 "movd " #b ", %%mm4 \n\t"\
909 "movq %%mm0, %%mm1 \n\t"\
910 "movq %%mm0, %%mm2 \n\t"\
911 "psrlq $8, %%mm1 \n\t"\
912 "movd 4" #b ", %%mm3 \n\t"\
913 "psubusb %%mm1, %%mm2 \n\t"\
914 "psubusb %%mm0, %%mm1 \n\t"\
915 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
916 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
917 "punpckldq %%mm3, %%mm4 \n\t"\
918 "movq %%mm1, %%mm3 \n\t"\
919 "psllq $32, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
920 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
921 "paddb %%mm6, %%mm0 \n\t"\
922 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
923 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
924 "movq %%mm4, %%mm3 \n\t"\
925 "paddb %%mm5, %%mm1 \n\t"\
926 "psubusb %%mm5, %%mm1 \n\t"\
927 "psrlq $8, %%mm3 \n\t"\
928 PAVGB(%%mm7, %%mm1)\
929 "pxor %%mm2, %%mm1 \n\t"\
930 "psubb %%mm2, %%mm1 \n\t"\
931 "movq %%mm4, %%mm2 \n\t"\
932 "psrlq $24, %%mm1 \n\t"\
933 "psubusb %%mm3, %%mm2 \n\t"\
934 "movd %%mm1, %%ecx \n\t"\
935 "psubusb %%mm4, %%mm3 \n\t"\
936 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
937 "por %%mm2, %%mm3 \n\t" /* p´x = |px - p(x+1)| */\
938 "paddb %%mm6, %%mm0 \n\t"\
939 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
940 "movq %%mm3, %%mm1 \n\t"\
941 "psllq $32, %%mm1 \n\t" /* p´5 = |p1 - p2| */\
942 "movq %%mm0, " #a " \n\t"\
943 PAVGB(%%mm3, %%mm1) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
944 "paddb %%mm6, %%mm4 \n\t"\
945 "psrlq $16, %%mm1 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
946 "psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
947 "paddb %%mm5, %%mm3 \n\t"\
948 "psubusb %%mm5, %%mm3 \n\t"\
949 PAVGB(%%mm7, %%mm3)\
950 "pxor %%mm2, %%mm3 \n\t"\
951 "psubb %%mm2, %%mm3 \n\t"\
952 "psrlq $24, %%mm3 \n\t"\
953 "movd " #c ", %%mm0 \n\t"\
954 "movd 4" #c ", %%mm1 \n\t"\
955 "punpckldq %%mm1, %%mm0 \n\t"\
956 "paddb %%mm6, %%mm0 \n\t"\
957 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
958 "paddb %%mm6, %%mm0 \n\t"\
959 "movq %%mm0, " #c " \n\t"\
960 "movd %%mm3, %%ecx \n\t"\
961 "movd " #d ", %%mm0 \n\t"\
962 "paddsb (%3, %%ecx, 8), %%mm4 \n\t"\
963 "movd 4" #d ", %%mm1 \n\t"\
964 "paddb %%mm6, %%mm4 \n\t"\
965 "punpckldq %%mm1, %%mm0 \n\t"\
966 "movq %%mm4, " #b " \n\t"\
967 "paddb %%mm6, %%mm0 \n\t"\
968 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
969 "paddb %%mm6, %%mm0 \n\t"\
970 "movq %%mm0, " #d " \n\t"\
971
972 HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
973 HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
974
975
976 :
977 : "r" (src), "r" (stride), "r" (QP), "r" (lut)
978 : "%eax", "%ebx", "%ecx"
979 );
980 #else
981
982 //FIXME (has little in common with the mmx2 version)
983 for(y=0; y<BLOCK_SIZE; y++)
984 {
985 int a= src[1] - src[2];
986 int b= src[3] - src[4];
987 int c= src[5] - src[6];
988
989 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
990
991 if(d < QP)
992 {
993 int v = d * SIGN(-b);
994
995 src[1] +=v/8;
996 src[2] +=v/4;
997 src[3] +=3*v/8;
998 src[4] -=3*v/8;
999 src[5] -=v/4;
1000 src[6] -=v/8;
1001
1002 }
1003 src+=stride;
1004 }
789 #endif 1005 #endif
790 } 1006 }
791 1007
792 1008
793 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) 1009 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
1636 1852
1637 horizontal_size >>= 1; 1853 horizontal_size >>= 1;
1638 vertical_size >>= 1; 1854 vertical_size >>= 1;
1639 src_stride >>= 1; 1855 src_stride >>= 1;
1640 dst_stride >>= 1; 1856 dst_stride >>= 1;
1857 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
1641 1858
1642 if(1) 1859 if(1)
1643 { 1860 {
1644 postProcess(src[1], src_stride, dst[1], dst_stride, 1861 postProcess(src[1], src_stride, dst[1], dst_stride,
1645 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); 1862 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
1646 postProcess(src[2], src_stride, dst[2], dst_stride, 1863 postProcess(src[2], src_stride, dst[2], dst_stride,
1647 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); 1864 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
1648 } 1865 }
1649 else 1866 else
1650 { 1867 {
1651 memcpy(dst[1], src[1], src_stride*horizontal_size); 1868 memcpy(dst[1], src[1], src_stride*horizontal_size);
1652 memcpy(dst[2], src[2], src_stride*horizontal_size); 1869 memcpy(dst[2], src[2], src_stride*horizontal_size);
1927 memcpyTime+= T1-T0; 2144 memcpyTime+= T1-T0;
1928 T0=T1; 2145 T0=T1;
1929 #endif 2146 #endif
1930 if(mode & V_DEBLOCK) 2147 if(mode & V_DEBLOCK)
1931 { 2148 {
1932 if(mode & RK_FILTER) 2149 if(mode & V_RK1_FILTER)
1933 vertRKFilter(vertBlock, stride, QP); 2150 vertRK1Filter(vertBlock, stride, QP);
1934 else if(mode & X1_FILTER) 2151 else if(mode & V_X1_FILTER)
1935 vertX1Filter(vertBlock, stride, QP); 2152 vertX1Filter(vertBlock, stride, QP);
1936 else 2153 else
1937 { 2154 {
1938 if( isVertDC(vertBlock, stride)) 2155 if( isVertDC(vertBlock, stride))
1939 { 2156 {
1960 #ifdef MORE_TIMEING 2177 #ifdef MORE_TIMEING
1961 T0= rdtsc(); 2178 T0= rdtsc();
1962 #endif 2179 #endif
1963 if(mode & H_DEBLOCK) 2180 if(mode & H_DEBLOCK)
1964 { 2181 {
1965 if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) 2182 if(mode & H_X1_FILTER)
2183 horizX1Filter(dstBlock-4, stride, QP);
2184 else
1966 { 2185 {
1967 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) 2186 if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
1968 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); 2187 {
2188 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP))
2189 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP);
2190 }
2191 else
2192 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
1969 } 2193 }
1970 else
1971 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
1972 } 2194 }
1973 #ifdef MORE_TIMEING 2195 #ifdef MORE_TIMEING
1974 T1= rdtsc(); 2196 T1= rdtsc();
1975 horizTime+= T1-T0; 2197 horizTime+= T1-T0;
1976 T0=T1; 2198 T0=T1;