Mercurial > libavcodec.hg
comparison libpostproc/postprocess_template.c @ 99:4f072fa99ccf libavcodec
fixed a rounding bug thing in the X1 Filter
changed the X1 Filter slightly to make flat blocks look like in the 9tap lpf
minor change to the -pp numbers & added decimal numbers in comments
new experimental horizontal deblocking filter
author | michael |
---|---|
date | Sat, 13 Oct 2001 02:31:15 +0000 |
parents | eaae16507d9b |
children | 1d1182345591 |
comparison
equal
deleted
inserted
replaced
98:eaae16507d9b | 99:4f072fa99ccf |
---|---|
25 isHorizDC Ec Ec | 25 isHorizDC Ec Ec |
26 isHorizMinMaxOk a | 26 isHorizMinMaxOk a |
27 doHorizLowPass E a a* | 27 doHorizLowPass E a a* |
28 doHorizDefFilter E ac ac | 28 doHorizDefFilter E ac ac |
29 deRing | 29 deRing |
30 RKAlgo1 E a a* | 30 Vertical RKAlgo1 E a a* |
31 X1 a E E* | 31 Vertical X1 a E E* |
32 Horizontal X1 a E E* | |
32 | 33 |
33 | 34 |
34 * i dont have a 3dnow CPU -> its untested | 35 * i dont have a 3dnow CPU -> its untested |
35 E = Exact implementation | 36 E = Exact implementation |
36 e = almost exact implementation | 37 e = almost exact implementation |
38 c = checked against the other implementations (-vo md5) | 39 c = checked against the other implementations (-vo md5) |
39 */ | 40 */ |
40 | 41 |
41 /* | 42 /* |
42 TODO: | 43 TODO: |
43 verify that everything works as it should | 44 verify that everything works as it should (how?) |
44 reduce the time wasted on the mem transfer | 45 reduce the time wasted on the mem transfer |
45 implement dering | 46 implement dering |
46 implement everything in C at least (done at the moment but ...) | 47 implement everything in C at least (done at the moment but ...) |
47 unroll stuff if instructions depend too much on the prior one | 48 unroll stuff if instructions depend too much on the prior one |
48 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? | 49 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? |
49 move YScale thing to the end instead of fixing QP | 50 move YScale thing to the end instead of fixing QP |
50 write a faster and higher quality deblocking filter :) | 51 write a faster and higher quality deblocking filter :) |
51 do something about the speed of the horizontal filters | 52 do something about the speed of the horizontal filters |
52 make the mainloop more flexible (variable number of blocks at once | 53 make the mainloop more flexible (variable number of blocks at once |
53 (the if/else stuff per block is slowing things down) | 54 (the if/else stuff per block is slowing things down) |
55 compare the quality & speed of all filters | |
56 implement a few simple deinterlacing filters | |
57 split this huge file | |
54 ... | 58 ... |
55 | 59 |
56 Notes: | 60 Notes: |
57 | 61 |
58 */ | 62 */ |
59 | 63 |
60 /* | 64 /* |
61 Changelog: | 65 Changelog: use the CVS log |
62 0.1.3 | 66 0.1.3 |
63 bugfixes: last 3 lines not brightness/contrast corrected | 67 bugfixes: last 3 lines not brightness/contrast corrected |
64 brightness statistics messed up with initial black pic | 68 brightness statistics messed up with initial black pic |
65 changed initial values of the brightness statistics | 69 changed initial values of the brightness statistics |
66 C++ -> C conversion | 70 C++ -> C conversion |
97 static uint64_t bm00001000= 0x00000000FF000000LL; | 101 static uint64_t bm00001000= 0x00000000FF000000LL; |
98 static uint64_t bm10000000= 0xFF00000000000000LL; | 102 static uint64_t bm10000000= 0xFF00000000000000LL; |
99 static uint64_t bm10000001= 0xFF000000000000FFLL; | 103 static uint64_t bm10000001= 0xFF000000000000FFLL; |
100 static uint64_t bm11000011= 0xFFFF00000000FFFFLL; | 104 static uint64_t bm11000011= 0xFFFF00000000FFFFLL; |
101 static uint64_t bm00000011= 0x000000000000FFFFLL; | 105 static uint64_t bm00000011= 0x000000000000FFFFLL; |
106 static uint64_t bm11111110= 0xFFFFFFFFFFFFFF00LL; | |
102 static uint64_t bm11000000= 0xFFFF000000000000LL; | 107 static uint64_t bm11000000= 0xFFFF000000000000LL; |
103 static uint64_t bm00011000= 0x000000FFFF000000LL; | 108 static uint64_t bm00011000= 0x000000FFFF000000LL; |
104 static uint64_t bm00110011= 0x0000FFFF0000FFFFLL; | 109 static uint64_t bm00110011= 0x0000FFFF0000FFFFLL; |
105 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL; | 110 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL; |
106 static uint64_t b00= 0x0000000000000000LL; | 111 static uint64_t b00= 0x0000000000000000LL; |
112 static uint64_t b01= 0x0101010101010101LL; | |
107 static uint64_t b02= 0x0202020202020202LL; | 113 static uint64_t b02= 0x0202020202020202LL; |
108 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL; | 114 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL; |
109 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL; | 115 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL; |
110 static uint64_t b20= 0x2020202020202020LL; | 116 static uint64_t b20= 0x2020202020202020LL; |
111 static uint64_t b80= 0x8080808080808080LL; | 117 static uint64_t b80= 0x8080808080808080LL; |
542 x = 8 | 548 x = 8 |
543 x/2 = 4 | 549 x/2 = 4 |
544 x/8 = 1 | 550 x/8 = 1 |
545 1 12 12 23 | 551 1 12 12 23 |
546 */ | 552 */ |
547 static inline void vertRKFilter(uint8_t *src, int stride, int QP) | 553 static inline void vertRK1Filter(uint8_t *src, int stride, int QP) |
548 { | 554 { |
549 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 555 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
550 // FIXME rounding | 556 // FIXME rounding |
551 asm volatile( | 557 asm volatile( |
552 "pxor %%mm7, %%mm7 \n\t" // 0 | 558 "pxor %%mm7, %%mm7 \n\t" // 0 |
636 #endif | 642 #endif |
637 } | 643 } |
638 | 644 |
639 /** | 645 /** |
640 * Experimental Filter 1 | 646 * Experimental Filter 1 |
641 * will nor damage linear gradients | 647 * will not damage linear gradients |
648 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter | |
642 * can only smooth blocks at the expected locations (it cant smooth them if they did move) | 649 * can only smooth blocks at the expected locations (it cant smooth them if they did move) |
643 * MMX2 version does correct clipping C version doesnt | 650 * MMX2 version does correct clipping C version doesnt |
644 */ | 651 */ |
645 static inline void vertX1Filter(uint8_t *src, int stride, int QP) | 652 static inline void vertX1Filter(uint8_t *src, int stride, int QP) |
646 { | 653 { |
673 "por %%mm5, %%mm4 \n\t" // |l4 - l5| | 680 "por %%mm5, %%mm4 \n\t" // |l4 - l5| |
674 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) | 681 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) |
675 "movq %%mm4, %%mm3 \n\t" // d | 682 "movq %%mm4, %%mm3 \n\t" // d |
676 "psubusb pQPb, %%mm4 \n\t" | 683 "psubusb pQPb, %%mm4 \n\t" |
677 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 | 684 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 |
685 "psubusb b01, %%mm3 \n\t" | |
678 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 | 686 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 |
679 | 687 |
680 PAVGB(%%mm7, %%mm3) // d/2 | 688 PAVGB(%%mm7, %%mm3) // d/2 |
689 "movq %%mm3, %%mm1 \n\t" // d/2 | |
690 PAVGB(%%mm7, %%mm3) // d/4 | |
691 PAVGB(%%mm1, %%mm3) // 3*d/8 | |
681 | 692 |
682 "movq (%0, %1, 4), %%mm0 \n\t" // line 4 | 693 "movq (%0, %1, 4), %%mm0 \n\t" // line 4 |
683 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 | 694 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
684 "psubusb %%mm3, %%mm0 \n\t" | 695 "psubusb %%mm3, %%mm0 \n\t" |
685 "pxor %%mm2, %%mm0 \n\t" | 696 "pxor %%mm2, %%mm0 \n\t" |
689 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 | 700 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
690 "paddusb %%mm3, %%mm0 \n\t" | 701 "paddusb %%mm3, %%mm0 \n\t" |
691 "pxor %%mm2, %%mm0 \n\t" | 702 "pxor %%mm2, %%mm0 \n\t" |
692 "movq %%mm0, (%%ebx) \n\t" // line 5 | 703 "movq %%mm0, (%%ebx) \n\t" // line 5 |
693 | 704 |
694 PAVGB(%%mm7, %%mm3) // d/4 | 705 PAVGB(%%mm7, %%mm1) // d/4 |
695 | 706 |
696 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 | 707 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
697 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 | 708 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
698 "psubusb %%mm3, %%mm0 \n\t" | 709 "psubusb %%mm1, %%mm0 \n\t" |
699 "pxor %%mm2, %%mm0 \n\t" | 710 "pxor %%mm2, %%mm0 \n\t" |
700 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 | 711 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 |
701 | 712 |
702 "movq (%%ebx, %1), %%mm0 \n\t" // line 6 | 713 "movq (%%ebx, %1), %%mm0 \n\t" // line 6 |
703 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 | 714 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
704 "paddusb %%mm3, %%mm0 \n\t" | 715 "paddusb %%mm1, %%mm0 \n\t" |
705 "pxor %%mm2, %%mm0 \n\t" | 716 "pxor %%mm2, %%mm0 \n\t" |
706 "movq %%mm0, (%%ebx, %1) \n\t" // line 6 | 717 "movq %%mm0, (%%ebx, %1) \n\t" // line 6 |
707 | 718 |
708 PAVGB(%%mm7, %%mm3) // d/8 | 719 PAVGB(%%mm7, %%mm1) // d/8 |
709 | 720 |
710 "movq (%%eax, %1), %%mm0 \n\t" // line 2 | 721 "movq (%%eax, %1), %%mm0 \n\t" // line 2 |
711 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 | 722 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 |
712 "psubusb %%mm3, %%mm0 \n\t" | 723 "psubusb %%mm1, %%mm0 \n\t" |
713 "pxor %%mm2, %%mm0 \n\t" | 724 "pxor %%mm2, %%mm0 \n\t" |
714 "movq %%mm0, (%%eax, %1) \n\t" // line 2 | 725 "movq %%mm0, (%%eax, %1) \n\t" // line 2 |
715 | 726 |
716 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7 | 727 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7 |
717 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 | 728 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 |
718 "paddusb %%mm3, %%mm0 \n\t" | 729 "paddusb %%mm1, %%mm0 \n\t" |
719 "pxor %%mm2, %%mm0 \n\t" | 730 "pxor %%mm2, %%mm0 \n\t" |
720 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7 | 731 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7 |
721 | 732 |
722 : | 733 : |
723 : "r" (src), "r" (stride) | 734 : "r" (src), "r" (stride) |
737 int x; | 748 int x; |
738 for(x=0; x<BLOCK_SIZE; x++) | 749 for(x=0; x<BLOCK_SIZE; x++) |
739 { | 750 { |
740 int a= src[l3] - src[l4]; | 751 int a= src[l3] - src[l4]; |
741 int b= src[l4] - src[l5]; | 752 int b= src[l4] - src[l5]; |
742 int c= src[l6] - src[l7]; | 753 int c= src[l5] - src[l6]; |
743 | 754 |
744 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); | 755 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); |
745 | 756 |
746 if(d < QP) | 757 if(d < QP) |
747 { | 758 { |
748 int v = d * SIGN(-b); | 759 int v = d * SIGN(-b); |
749 | 760 |
750 src[l2] +=v/8; | 761 src[l2] +=v/8; |
751 src[l3] +=v/4; | 762 src[l3] +=v/4; |
752 src[l4] +=v/2; | 763 src[l4] +=3*v/8; |
753 src[l5] -=v/2; | 764 src[l5] -=3*v/8; |
754 src[l6] -=v/4; | 765 src[l6] -=v/4; |
755 src[l7] -=v/8; | 766 src[l7] -=v/8; |
756 | 767 |
757 } | 768 } |
758 src++; | 769 src++; |
784 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16; | 795 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16; |
785 } | 796 } |
786 src++; | 797 src++; |
787 } | 798 } |
788 */ | 799 */ |
800 #endif | |
801 } | |
802 | |
803 /** | |
804 * Experimental Filter 1 (Horizontal) | |
805 * will not damage linear gradients | |
806 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter | |
807 * can only smooth blocks at the expected locations (it cant smooth them if they did move) | |
808 * MMX2 version does correct clipping C version doesnt | |
809 * not identical with the vertical one | |
810 */ | |
811 static inline void horizX1Filter(uint8_t *src, int stride, int QP) | |
812 { | |
813 int y; | |
814 static uint64_t *lut= NULL; | |
815 if(lut==NULL) | |
816 { | |
817 int i; | |
818 lut= (uint64_t*)memalign(8, 256*8); | |
819 for(i=0; i<256; i++) | |
820 { | |
821 int v= i < 128 ? 2*i : 2*(i-256); | |
822 /* | |
823 //Simulate 112242211 9-Tap filter | |
824 uint64_t a= (v/16) & 0xFF; | |
825 uint64_t b= (v/8) & 0xFF; | |
826 uint64_t c= (v/4) & 0xFF; | |
827 uint64_t d= (3*v/8) & 0xFF; | |
828 */ | |
829 //Simulate piecewise linear interpolation | |
830 uint64_t a= (v/16) & 0xFF; | |
831 uint64_t b= (v*3/16) & 0xFF; | |
832 uint64_t c= (v*5/16) & 0xFF; | |
833 uint64_t d= (7*v/16) & 0xFF; | |
834 uint64_t A= (0x100 - a)&0xFF; | |
835 uint64_t B= (0x100 - b)&0xFF; | |
836 uint64_t C= (0x100 - c)&0xFF; | |
837 uint64_t D= (0x100 - c)&0xFF; | |
838 | |
839 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) | | |
840 (D<<24) | (C<<16) | (B<<8) | (A); | |
841 //lut[i] = (v<<32) | (v<<24); | |
842 } | |
843 } | |
844 | |
845 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
846 asm volatile( | |
847 "pxor %%mm7, %%mm7 \n\t" // 0 | |
848 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | |
849 "leal (%0, %1), %%eax \n\t" | |
850 "leal (%%eax, %1, 4), %%ebx \n\t" | |
851 | |
852 "movq b80, %%mm6 \n\t" | |
853 "movd %2, %%mm5 \n\t" // QP | |
854 "movq %%mm5, %%mm4 \n\t" | |
855 "paddusb %%mm5, %%mm5 \n\t" // 2QP | |
856 "paddusb %%mm5, %%mm4 \n\t" // 3QP | |
857 "pxor %%mm5, %%mm5 \n\t" // 0 | |
858 "psubb %%mm4, %%mm5 \n\t" // -3QP | |
859 "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP | |
860 "psllq $24, %%mm5 \n\t" | |
861 | |
862 // 0 1 2 3 4 5 6 7 8 9 | |
863 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
864 | |
865 #define HX1old(a) \ | |
866 "movd " #a ", %%mm0 \n\t"\ | |
867 "movd 4" #a ", %%mm1 \n\t"\ | |
868 "punpckldq %%mm1, %%mm0 \n\t"\ | |
869 "movq %%mm0, %%mm1 \n\t"\ | |
870 "movq %%mm0, %%mm2 \n\t"\ | |
871 "psrlq $8, %%mm1 \n\t"\ | |
872 "psubusb %%mm1, %%mm2 \n\t"\ | |
873 "psubusb %%mm0, %%mm1 \n\t"\ | |
874 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ | |
875 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ | |
876 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\ | |
877 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\ | |
878 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\ | |
879 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ | |
880 "paddb %%mm5, %%mm1 \n\t"\ | |
881 "psubusb %%mm5, %%mm1 \n\t"\ | |
882 PAVGB(%%mm7, %%mm1)\ | |
883 "pxor %%mm2, %%mm1 \n\t"\ | |
884 "psubb %%mm2, %%mm1 \n\t"\ | |
885 "psrlq $24, %%mm1 \n\t"\ | |
886 "movd %%mm1, %%ecx \n\t"\ | |
887 "paddb %%mm6, %%mm0 \n\t"\ | |
888 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\ | |
889 "paddb %%mm6, %%mm0 \n\t"\ | |
890 "movq %%mm0, " #a " \n\t"\ | |
891 | |
892 /* | |
893 HX1old((%0)) | |
894 HX1old((%%eax)) | |
895 HX1old((%%eax, %1)) | |
896 HX1old((%%eax, %1, 2)) | |
897 HX1old((%0, %1, 4)) | |
898 HX1old((%%ebx)) | |
899 HX1old((%%ebx, %1)) | |
900 HX1old((%%ebx, %1, 2)) | |
901 */ | |
902 | |
903 //FIXME add some comments, its unreadable ... | |
904 #define HX1b(a, c, b, d) \ | |
905 "movd " #a ", %%mm0 \n\t"\ | |
906 "movd 4" #a ", %%mm1 \n\t"\ | |
907 "punpckldq %%mm1, %%mm0 \n\t"\ | |
908 "movd " #b ", %%mm4 \n\t"\ | |
909 "movq %%mm0, %%mm1 \n\t"\ | |
910 "movq %%mm0, %%mm2 \n\t"\ | |
911 "psrlq $8, %%mm1 \n\t"\ | |
912 "movd 4" #b ", %%mm3 \n\t"\ | |
913 "psubusb %%mm1, %%mm2 \n\t"\ | |
914 "psubusb %%mm0, %%mm1 \n\t"\ | |
915 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ | |
916 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ | |
917 "punpckldq %%mm3, %%mm4 \n\t"\ | |
918 "movq %%mm1, %%mm3 \n\t"\ | |
919 "psllq $32, %%mm3 \n\t" /* p´5 = |p1 - p2| */\ | |
920 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\ | |
921 "paddb %%mm6, %%mm0 \n\t"\ | |
922 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\ | |
923 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ | |
924 "movq %%mm4, %%mm3 \n\t"\ | |
925 "paddb %%mm5, %%mm1 \n\t"\ | |
926 "psubusb %%mm5, %%mm1 \n\t"\ | |
927 "psrlq $8, %%mm3 \n\t"\ | |
928 PAVGB(%%mm7, %%mm1)\ | |
929 "pxor %%mm2, %%mm1 \n\t"\ | |
930 "psubb %%mm2, %%mm1 \n\t"\ | |
931 "movq %%mm4, %%mm2 \n\t"\ | |
932 "psrlq $24, %%mm1 \n\t"\ | |
933 "psubusb %%mm3, %%mm2 \n\t"\ | |
934 "movd %%mm1, %%ecx \n\t"\ | |
935 "psubusb %%mm4, %%mm3 \n\t"\ | |
936 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\ | |
937 "por %%mm2, %%mm3 \n\t" /* p´x = |px - p(x+1)| */\ | |
938 "paddb %%mm6, %%mm0 \n\t"\ | |
939 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ | |
940 "movq %%mm3, %%mm1 \n\t"\ | |
941 "psllq $32, %%mm1 \n\t" /* p´5 = |p1 - p2| */\ | |
942 "movq %%mm0, " #a " \n\t"\ | |
943 PAVGB(%%mm3, %%mm1) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\ | |
944 "paddb %%mm6, %%mm4 \n\t"\ | |
945 "psrlq $16, %%mm1 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\ | |
946 "psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ | |
947 "paddb %%mm5, %%mm3 \n\t"\ | |
948 "psubusb %%mm5, %%mm3 \n\t"\ | |
949 PAVGB(%%mm7, %%mm3)\ | |
950 "pxor %%mm2, %%mm3 \n\t"\ | |
951 "psubb %%mm2, %%mm3 \n\t"\ | |
952 "psrlq $24, %%mm3 \n\t"\ | |
953 "movd " #c ", %%mm0 \n\t"\ | |
954 "movd 4" #c ", %%mm1 \n\t"\ | |
955 "punpckldq %%mm1, %%mm0 \n\t"\ | |
956 "paddb %%mm6, %%mm0 \n\t"\ | |
957 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\ | |
958 "paddb %%mm6, %%mm0 \n\t"\ | |
959 "movq %%mm0, " #c " \n\t"\ | |
960 "movd %%mm3, %%ecx \n\t"\ | |
961 "movd " #d ", %%mm0 \n\t"\ | |
962 "paddsb (%3, %%ecx, 8), %%mm4 \n\t"\ | |
963 "movd 4" #d ", %%mm1 \n\t"\ | |
964 "paddb %%mm6, %%mm4 \n\t"\ | |
965 "punpckldq %%mm1, %%mm0 \n\t"\ | |
966 "movq %%mm4, " #b " \n\t"\ | |
967 "paddb %%mm6, %%mm0 \n\t"\ | |
968 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\ | |
969 "paddb %%mm6, %%mm0 \n\t"\ | |
970 "movq %%mm0, " #d " \n\t"\ | |
971 | |
972 HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2)) | |
973 HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2)) | |
974 | |
975 | |
976 : | |
977 : "r" (src), "r" (stride), "r" (QP), "r" (lut) | |
978 : "%eax", "%ebx", "%ecx" | |
979 ); | |
980 #else | |
981 | |
982 //FIXME (has little in common with the mmx2 version) | |
983 for(y=0; y<BLOCK_SIZE; y++) | |
984 { | |
985 int a= src[1] - src[2]; | |
986 int b= src[3] - src[4]; | |
987 int c= src[5] - src[6]; | |
988 | |
989 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); | |
990 | |
991 if(d < QP) | |
992 { | |
993 int v = d * SIGN(-b); | |
994 | |
995 src[1] +=v/8; | |
996 src[2] +=v/4; | |
997 src[3] +=3*v/8; | |
998 src[4] -=3*v/8; | |
999 src[5] -=v/4; | |
1000 src[6] -=v/8; | |
1001 | |
1002 } | |
1003 src+=stride; | |
1004 } | |
789 #endif | 1005 #endif |
790 } | 1006 } |
791 | 1007 |
792 | 1008 |
793 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) | 1009 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) |
1636 | 1852 |
1637 horizontal_size >>= 1; | 1853 horizontal_size >>= 1; |
1638 vertical_size >>= 1; | 1854 vertical_size >>= 1; |
1639 src_stride >>= 1; | 1855 src_stride >>= 1; |
1640 dst_stride >>= 1; | 1856 dst_stride >>= 1; |
1857 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00); | |
1641 | 1858 |
1642 if(1) | 1859 if(1) |
1643 { | 1860 { |
1644 postProcess(src[1], src_stride, dst[1], dst_stride, | 1861 postProcess(src[1], src_stride, dst[1], dst_stride, |
1645 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); | 1862 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode); |
1646 postProcess(src[2], src_stride, dst[2], dst_stride, | 1863 postProcess(src[2], src_stride, dst[2], dst_stride, |
1647 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); | 1864 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode); |
1648 } | 1865 } |
1649 else | 1866 else |
1650 { | 1867 { |
1651 memcpy(dst[1], src[1], src_stride*horizontal_size); | 1868 memcpy(dst[1], src[1], src_stride*horizontal_size); |
1652 memcpy(dst[2], src[2], src_stride*horizontal_size); | 1869 memcpy(dst[2], src[2], src_stride*horizontal_size); |
1927 memcpyTime+= T1-T0; | 2144 memcpyTime+= T1-T0; |
1928 T0=T1; | 2145 T0=T1; |
1929 #endif | 2146 #endif |
1930 if(mode & V_DEBLOCK) | 2147 if(mode & V_DEBLOCK) |
1931 { | 2148 { |
1932 if(mode & RK_FILTER) | 2149 if(mode & V_RK1_FILTER) |
1933 vertRKFilter(vertBlock, stride, QP); | 2150 vertRK1Filter(vertBlock, stride, QP); |
1934 else if(mode & X1_FILTER) | 2151 else if(mode & V_X1_FILTER) |
1935 vertX1Filter(vertBlock, stride, QP); | 2152 vertX1Filter(vertBlock, stride, QP); |
1936 else | 2153 else |
1937 { | 2154 { |
1938 if( isVertDC(vertBlock, stride)) | 2155 if( isVertDC(vertBlock, stride)) |
1939 { | 2156 { |
1960 #ifdef MORE_TIMEING | 2177 #ifdef MORE_TIMEING |
1961 T0= rdtsc(); | 2178 T0= rdtsc(); |
1962 #endif | 2179 #endif |
1963 if(mode & H_DEBLOCK) | 2180 if(mode & H_DEBLOCK) |
1964 { | 2181 { |
1965 if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) | 2182 if(mode & H_X1_FILTER) |
2183 horizX1Filter(dstBlock-4, stride, QP); | |
2184 else | |
1966 { | 2185 { |
1967 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) | 2186 if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) |
1968 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); | 2187 { |
2188 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) | |
2189 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); | |
2190 } | |
2191 else | |
2192 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); | |
1969 } | 2193 } |
1970 else | |
1971 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); | |
1972 } | 2194 } |
1973 #ifdef MORE_TIMEING | 2195 #ifdef MORE_TIMEING |
1974 T1= rdtsc(); | 2196 T1= rdtsc(); |
1975 horizTime+= T1-T0; | 2197 horizTime+= T1-T0; |
1976 T0=T1; | 2198 T0=T1; |