Mercurial > mplayer.hg
comparison postproc/postprocess.c @ 3031:86e1a0f4f0bc
cleanup
precopy fewer lines from src to dst if possible
speedup (due to cleanup of blockcopy)
author | michael |
---|---|
date | Tue, 20 Nov 2001 17:47:52 +0000 |
parents | 71384f064a3e |
children | 6de073cf52b5 |
comparison
equal
deleted
inserted
replaced
3030:1dbc569b6528 | 3031:86e1a0f4f0bc |
---|---|
60 split this huge file | 60 split this huge file |
61 border remover | 61 border remover |
62 optimize c versions | 62 optimize c versions |
63 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks | 63 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks |
64 smart blur | 64 smart blur |
65 commandline option for the deblock thresholds | |
65 ... | 66 ... |
66 */ | 67 */ |
67 | 68 |
68 //Changelog: use the CVS log | 69 //Changelog: use the CVS log |
69 | 70 |
856 } | 857 } |
857 */ | 858 */ |
858 #endif | 859 #endif |
859 } | 860 } |
860 | 861 |
861 /** | |
862 * Experimental Filter 1 (Horizontal) | |
863 * will not damage linear gradients | |
864 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter | |
865 * can only smooth blocks at the expected locations (it cant smooth them if they did move) | |
866 * MMX2 version does correct clipping C version doesnt | |
867 * not identical with the vertical one | |
868 */ | |
869 static inline void horizX1Filter(uint8_t *src, int stride, int QP) | |
870 { | |
871 int y; | |
872 static uint64_t *lut= NULL; | |
873 if(lut==NULL) | |
874 { | |
875 int i; | |
876 lut= (uint64_t*)memalign(8, 256*8); | |
877 for(i=0; i<256; i++) | |
878 { | |
879 int v= i < 128 ? 2*i : 2*(i-256); | |
880 /* | |
881 //Simulate 112242211 9-Tap filter | |
882 uint64_t a= (v/16) & 0xFF; | |
883 uint64_t b= (v/8) & 0xFF; | |
884 uint64_t c= (v/4) & 0xFF; | |
885 uint64_t d= (3*v/8) & 0xFF; | |
886 */ | |
887 //Simulate piecewise linear interpolation | |
888 uint64_t a= (v/16) & 0xFF; | |
889 uint64_t b= (v*3/16) & 0xFF; | |
890 uint64_t c= (v*5/16) & 0xFF; | |
891 uint64_t d= (7*v/16) & 0xFF; | |
892 uint64_t A= (0x100 - a)&0xFF; | |
893 uint64_t B= (0x100 - b)&0xFF; | |
894 uint64_t C= (0x100 - c)&0xFF; | |
895 uint64_t D= (0x100 - c)&0xFF; | |
896 | |
897 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) | | |
898 (D<<24) | (C<<16) | (B<<8) | (A); | |
899 //lut[i] = (v<<32) | (v<<24); | |
900 } | |
901 } | |
902 | |
903 #if 0 | |
904 asm volatile( | |
905 "pxor %%mm7, %%mm7 \n\t" // 0 | |
906 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | |
907 "leal (%0, %1), %%eax \n\t" | |
908 "leal (%%eax, %1, 4), %%ebx \n\t" | |
909 | |
910 "movq b80, %%mm6 \n\t" | |
911 "movd pQPb, %%mm5 \n\t" // QP | |
912 "movq %%mm5, %%mm4 \n\t" | |
913 "paddusb %%mm5, %%mm5 \n\t" // 2QP | |
914 "paddusb %%mm5, %%mm4 \n\t" // 3QP | |
915 "pxor %%mm5, %%mm5 \n\t" // 0 | |
916 "psubb %%mm4, %%mm5 \n\t" // -3QP | |
917 "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP | |
918 "psllq $24, %%mm5 \n\t" | |
919 | |
920 // 0 1 2 3 4 5 6 7 8 9 | |
921 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
922 | |
923 #define HX1old(a) \ | |
924 "movd " #a ", %%mm0 \n\t"\ | |
925 "movd 4" #a ", %%mm1 \n\t"\ | |
926 "punpckldq %%mm1, %%mm0 \n\t"\ | |
927 "movq %%mm0, %%mm1 \n\t"\ | |
928 "movq %%mm0, %%mm2 \n\t"\ | |
929 "psrlq $8, %%mm1 \n\t"\ | |
930 "psubusb %%mm1, %%mm2 \n\t"\ | |
931 "psubusb %%mm0, %%mm1 \n\t"\ | |
932 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ | |
933 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ | |
934 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\ | |
935 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\ | |
936 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\ | |
937 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ | |
938 "paddb %%mm5, %%mm1 \n\t"\ | |
939 "psubusb %%mm5, %%mm1 \n\t"\ | |
940 PAVGB(%%mm7, %%mm1)\ | |
941 "pxor %%mm2, %%mm1 \n\t"\ | |
942 "psubb %%mm2, %%mm1 \n\t"\ | |
943 "psrlq $24, %%mm1 \n\t"\ | |
944 "movd %%mm1, %%ecx \n\t"\ | |
945 "paddb %%mm6, %%mm0 \n\t"\ | |
946 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\ | |
947 "paddb %%mm6, %%mm0 \n\t"\ | |
948 "movq %%mm0, " #a " \n\t"\ | |
949 | |
950 /* | |
951 HX1old((%0)) | |
952 HX1old((%%eax)) | |
953 HX1old((%%eax, %1)) | |
954 HX1old((%%eax, %1, 2)) | |
955 HX1old((%0, %1, 4)) | |
956 HX1old((%%ebx)) | |
957 HX1old((%%ebx, %1)) | |
958 HX1old((%%ebx, %1, 2)) | |
959 */ | |
960 | |
961 //FIXME add some comments, its unreadable ... | |
962 #define HX1b(a, c, b, d) \ | |
963 "movd " #a ", %%mm0 \n\t"\ | |
964 "movd 4" #a ", %%mm1 \n\t"\ | |
965 "punpckldq %%mm1, %%mm0 \n\t"\ | |
966 "movd " #b ", %%mm4 \n\t"\ | |
967 "movq %%mm0, %%mm1 \n\t"\ | |
968 "movq %%mm0, %%mm2 \n\t"\ | |
969 "psrlq $8, %%mm1 \n\t"\ | |
970 "movd 4" #b ", %%mm3 \n\t"\ | |
971 "psubusb %%mm1, %%mm2 \n\t"\ | |
972 "psubusb %%mm0, %%mm1 \n\t"\ | |
973 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ | |
974 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ | |
975 "punpckldq %%mm3, %%mm4 \n\t"\ | |
976 "movq %%mm1, %%mm3 \n\t"\ | |
977 "psllq $32, %%mm3 \n\t" /* p´5 = |p1 - p2| */\ | |
978 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\ | |
979 "paddb %%mm6, %%mm0 \n\t"\ | |
980 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\ | |
981 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ | |
982 "movq %%mm4, %%mm3 \n\t"\ | |
983 "paddb %%mm5, %%mm1 \n\t"\ | |
984 "psubusb %%mm5, %%mm1 \n\t"\ | |
985 "psrlq $8, %%mm3 \n\t"\ | |
986 PAVGB(%%mm7, %%mm1)\ | |
987 "pxor %%mm2, %%mm1 \n\t"\ | |
988 "psubb %%mm2, %%mm1 \n\t"\ | |
989 "movq %%mm4, %%mm2 \n\t"\ | |
990 "psrlq $24, %%mm1 \n\t"\ | |
991 "psubusb %%mm3, %%mm2 \n\t"\ | |
992 "movd %%mm1, %%ecx \n\t"\ | |
993 "psubusb %%mm4, %%mm3 \n\t"\ | |
994 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\ | |
995 "por %%mm2, %%mm3 \n\t" /* p´x = |px - p(x+1)| */\ | |
996 "paddb %%mm6, %%mm0 \n\t"\ | |
997 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ | |
998 "movq %%mm3, %%mm1 \n\t"\ | |
999 "psllq $32, %%mm1 \n\t" /* p´5 = |p1 - p2| */\ | |
1000 "movq %%mm0, " #a " \n\t"\ | |
1001 PAVGB(%%mm3, %%mm1) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\ | |
1002 "paddb %%mm6, %%mm4 \n\t"\ | |
1003 "psrlq $16, %%mm1 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\ | |
1004 "psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ | |
1005 "paddb %%mm5, %%mm3 \n\t"\ | |
1006 "psubusb %%mm5, %%mm3 \n\t"\ | |
1007 PAVGB(%%mm7, %%mm3)\ | |
1008 "pxor %%mm2, %%mm3 \n\t"\ | |
1009 "psubb %%mm2, %%mm3 \n\t"\ | |
1010 "psrlq $24, %%mm3 \n\t"\ | |
1011 "movd " #c ", %%mm0 \n\t"\ | |
1012 "movd 4" #c ", %%mm1 \n\t"\ | |
1013 "punpckldq %%mm1, %%mm0 \n\t"\ | |
1014 "paddb %%mm6, %%mm0 \n\t"\ | |
1015 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\ | |
1016 "paddb %%mm6, %%mm0 \n\t"\ | |
1017 "movq %%mm0, " #c " \n\t"\ | |
1018 "movd %%mm3, %%ecx \n\t"\ | |
1019 "movd " #d ", %%mm0 \n\t"\ | |
1020 "paddsb (%2, %%ecx, 8), %%mm4 \n\t"\ | |
1021 "movd 4" #d ", %%mm1 \n\t"\ | |
1022 "paddb %%mm6, %%mm4 \n\t"\ | |
1023 "punpckldq %%mm1, %%mm0 \n\t"\ | |
1024 "movq %%mm4, " #b " \n\t"\ | |
1025 "paddb %%mm6, %%mm0 \n\t"\ | |
1026 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\ | |
1027 "paddb %%mm6, %%mm0 \n\t"\ | |
1028 "movq %%mm0, " #d " \n\t"\ | |
1029 | |
1030 HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2)) | |
1031 HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2)) | |
1032 | |
1033 | |
1034 : | |
1035 : "r" (src), "r" (stride), "r" (lut) | |
1036 : "%eax", "%ebx", "%ecx" | |
1037 ); | |
1038 #else | |
1039 | |
1040 //FIXME (has little in common with the mmx2 version) | |
1041 for(y=0; y<BLOCK_SIZE; y++) | |
1042 { | |
1043 int a= src[1] - src[2]; | |
1044 int b= src[3] - src[4]; | |
1045 int c= src[5] - src[6]; | |
1046 | |
1047 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); | |
1048 | |
1049 if(d < QP) | |
1050 { | |
1051 int v = d * SIGN(-b); | |
1052 | |
1053 src[1] +=v/8; | |
1054 src[2] +=v/4; | |
1055 src[3] +=3*v/8; | |
1056 src[4] -=3*v/8; | |
1057 src[5] -=v/4; | |
1058 src[6] -=v/8; | |
1059 | |
1060 } | |
1061 src+=stride; | |
1062 } | |
1063 #endif | |
1064 } | |
1065 | |
1066 | |
1067 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) | 862 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) |
1068 { | 863 { |
1069 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 864 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
1070 /* | 865 /* |
1071 uint8_t tmp[16]; | 866 uint8_t tmp[16]; |
1637 | 1432 |
1638 src[l4]-= d; | 1433 src[l4]-= d; |
1639 src[l5]+= d; | 1434 src[l5]+= d; |
1640 } | 1435 } |
1641 src++; | 1436 src++; |
1642 } | |
1643 #endif | |
1644 } | |
1645 | |
//FIXME? |255-0| = 1
/**
 * Check if the given 8x8 Block is mostly "flat"
 * (counts horizontal neighbour pairs that differ by at most 1 and compares
 * the count against hFlatnessThreshold)
 */
static inline int isHorizDC(uint8_t src[], int stride)
{
//	src++;
	int numEq= 0;
#if 0
	/* disabled MMX version: counts near-equal horizontal neighbours and, as a
	   side effect, copies the 8 lines into tempBlock; picks one of two copy
	   variants depending on whether src crosses a 32-byte cache line */
	asm volatile (
//		"int $3 \n\t"
		"leal (%1, %2), %%ecx				\n\t"
		"leal (%%ecx, %2, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%1	ecx	ecx+%2	ecx+2%2	%1+4%2	ebx	ebx+%2	ebx+2%2	%1+8%2	ebx+4%2
		"movq b7E, %%mm7				\n\t" // mm7 = 0x7F
		"movq b7C, %%mm6				\n\t" // mm6 = 0x7D
		"pxor %%mm0, %%mm0				\n\t"
		"movl %1, %%eax					\n\t"
		"andl $0x1F, %%eax				\n\t"
		"cmpl $24, %%eax				\n\t"
		"leal tempBlock, %%eax				\n\t"
		"jb 1f						\n\t"

#define HDC_CHECK_AND_CPY(src, dst) \
		"movd " #src ", %%mm2				\n\t"\
		"punpckldq 4" #src ", %%mm2			\n\t" /* (%1) */\
		"movq %%mm2, %%mm1				\n\t"\
		"psrlq $8, %%mm2				\n\t"\
		"psubb %%mm1, %%mm2				\n\t"\
		"paddb %%mm7, %%mm2				\n\t"\
		"pcmpgtb %%mm6, %%mm2				\n\t"\
		"paddb %%mm2, %%mm0				\n\t"\
		"movq %%mm1," #dst "(%%eax)			\n\t"

		HDC_CHECK_AND_CPY((%1),0)
		HDC_CHECK_AND_CPY((%%ecx),8)
		HDC_CHECK_AND_CPY((%%ecx, %2),16)
		HDC_CHECK_AND_CPY((%%ecx, %2, 2),24)
		HDC_CHECK_AND_CPY((%1, %2, 4),32)
		HDC_CHECK_AND_CPY((%%ebx),40)
		HDC_CHECK_AND_CPY((%%ebx, %2),48)
		HDC_CHECK_AND_CPY((%%ebx, %2, 2),56)
		"jmp 2f						\n\t"
		"1:						\n\t"
		// src does not cross a 32 byte cache line so dont waste time with alignment
#define HDC_CHECK_AND_CPY2(src, dst) \
		"movq " #src ", %%mm2				\n\t"\
		"movq " #src ", %%mm1				\n\t"\
		"psrlq $8, %%mm2				\n\t"\
		"psubb %%mm1, %%mm2				\n\t"\
		"paddb %%mm7, %%mm2				\n\t"\
		"pcmpgtb %%mm6, %%mm2				\n\t"\
		"paddb %%mm2, %%mm0				\n\t"\
		"movq %%mm1," #dst "(%%eax)			\n\t"

		HDC_CHECK_AND_CPY2((%1),0)
		HDC_CHECK_AND_CPY2((%%ecx),8)
		HDC_CHECK_AND_CPY2((%%ecx, %2),16)
		HDC_CHECK_AND_CPY2((%%ecx, %2, 2),24)
		HDC_CHECK_AND_CPY2((%1, %2, 4),32)
		HDC_CHECK_AND_CPY2((%%ebx),40)
		HDC_CHECK_AND_CPY2((%%ebx, %2),48)
		HDC_CHECK_AND_CPY2((%%ebx, %2, 2),56)
		"2:						\n\t"
		/* horizontal byte-wise sum of the per-lane counters in mm0 */
		"psllq $8, %%mm0				\n\t" // remove dummy value
		"movq %%mm0, %%mm1				\n\t"
		"psrlw $8, %%mm0				\n\t"
		"paddb %%mm1, %%mm0				\n\t"
		"movq %%mm0, %%mm1				\n\t"
		"psrlq $16, %%mm0				\n\t"
		"paddb %%mm1, %%mm0				\n\t"
		"movq %%mm0, %%mm1				\n\t"
		"psrlq $32, %%mm0				\n\t"
		"paddb %%mm1, %%mm0				\n\t"
		"movd %%mm0, %0					\n\t"
		: "=r" (numEq)
		: "r" (src), "r" (stride)
		: "%eax", "%ebx", "%ecx"
		);
//	printf("%d\n", numEq);
	numEq= (256 - numEq) &0xFF;
#else
	int y;
	for(y=0; y<BLOCK_SIZE; y++)
	{
		/* ((diff + 1) & 0xFFFF) < 3  <=>  diff is in {-1, 0, +1}:
		   negative values wrap to >= 0xFF00 under the mask, so only
		   near-equal neighbour pairs are counted */
		if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
		if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
		if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
		if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
		if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
		if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
		if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
		src+= stride;
	}
#endif
/*	if(abs(numEq - asmEq) > 0)
	{
//		printf("\nasm:%d  c:%d\n", asmEq, numEq);
		for(int y=0; y<8; y++)
		{
			for(int x=0; x<8; x++)
			{
				printf("%d ", src[x + y*stride]);
			}
			printf("\n");
		}
	}
*/
//	printf("%d\n", numEq);
	return numEq > hFlatnessThreshold;
}
1758 | |
/**
 * Quick dynamic-range check for a line: the block may only be smoothed
 * horizontally when its two end pixels differ by no more than 2*QP.
 */
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{
	return abs(src[0] - src[7]) <= 2*QP;
}
1765 | |
1766 static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP) | |
1767 { | |
1768 #if 0 | |
1769 asm volatile( | |
1770 "leal (%0, %1), %%ecx \n\t" | |
1771 "leal (%%ecx, %1, 4), %%ebx \n\t" | |
1772 // 0 1 2 3 4 5 6 7 8 9 | |
1773 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
1774 "pxor %%mm7, %%mm7 \n\t" | |
1775 "movq bm00001000, %%mm6 \n\t" | |
1776 "movd %2, %%mm5 \n\t" // QP | |
1777 "movq %%mm5, %%mm4 \n\t" | |
1778 "paddusb %%mm5, %%mm5 \n\t" // 2QP | |
1779 "paddusb %%mm5, %%mm4 \n\t" // 3QP | |
1780 "psllq $24, %%mm4 \n\t" | |
1781 "pxor %%mm5, %%mm5 \n\t" // 0 | |
1782 "psubb %%mm4, %%mm5 \n\t" // -QP | |
1783 "leal tempBlock, %%eax \n\t" | |
1784 | |
1785 //FIXME? "unroll by 2" and mix | |
1786 #ifdef HAVE_MMX2 | |
1787 #define HDF(src, dst) \ | |
1788 "movq " #src "(%%eax), %%mm0 \n\t"\ | |
1789 "movq " #src "(%%eax), %%mm1 \n\t"\ | |
1790 "movq " #src "(%%eax), %%mm2 \n\t"\ | |
1791 "psrlq $8, %%mm1 \n\t"\ | |
1792 "psubusb %%mm1, %%mm2 \n\t"\ | |
1793 "psubusb %%mm0, %%mm1 \n\t"\ | |
1794 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ | |
1795 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ | |
1796 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\ | |
1797 "pminub %%mm1, %%mm3 \n\t" /* p´5 = min(|p2-p1|, |p6-p5|)*/\ | |
1798 "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\ | |
1799 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\ | |
1800 "paddb %%mm5, %%mm1 \n\t"\ | |
1801 "psubusb %%mm5, %%mm1 \n\t"\ | |
1802 "psrlw $2, %%mm1 \n\t"\ | |
1803 "pxor %%mm2, %%mm1 \n\t"\ | |
1804 "psubb %%mm2, %%mm1 \n\t"\ | |
1805 "pand %%mm6, %%mm1 \n\t"\ | |
1806 "psubb %%mm1, %%mm0 \n\t"\ | |
1807 "psllq $8, %%mm1 \n\t"\ | |
1808 "paddb %%mm1, %%mm0 \n\t"\ | |
1809 "movd %%mm0, " #dst" \n\t"\ | |
1810 "psrlq $32, %%mm0 \n\t"\ | |
1811 "movd %%mm0, 4" #dst" \n\t" | |
1812 #else | |
1813 #define HDF(src, dst)\ | |
1814 "movq " #src "(%%eax), %%mm0 \n\t"\ | |
1815 "movq %%mm0, %%mm1 \n\t"\ | |
1816 "movq %%mm0, %%mm2 \n\t"\ | |
1817 "psrlq $8, %%mm1 \n\t"\ | |
1818 "psubusb %%mm1, %%mm2 \n\t"\ | |
1819 "psubusb %%mm0, %%mm1 \n\t"\ | |
1820 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ | |
1821 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ | |
1822 "movq %%mm1, %%mm3 \n\t"\ | |
1823 "psllq $32, %%mm3 \n\t"\ | |
1824 "movq %%mm3, %%mm4 \n\t"\ | |
1825 "psubusb %%mm1, %%mm4 \n\t"\ | |
1826 "psubb %%mm4, %%mm3 \n\t"\ | |
1827 "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\ | |
1828 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5,ü6|) */\ | |
1829 "paddb %%mm5, %%mm1 \n\t"\ | |
1830 "psubusb %%mm5, %%mm1 \n\t"\ | |
1831 "psrlw $2, %%mm1 \n\t"\ | |
1832 "pxor %%mm2, %%mm1 \n\t"\ | |
1833 "psubb %%mm2, %%mm1 \n\t"\ | |
1834 "pand %%mm6, %%mm1 \n\t"\ | |
1835 "psubb %%mm1, %%mm0 \n\t"\ | |
1836 "psllq $8, %%mm1 \n\t"\ | |
1837 "paddb %%mm1, %%mm0 \n\t"\ | |
1838 "movd %%mm0, " #dst " \n\t"\ | |
1839 "psrlq $32, %%mm0 \n\t"\ | |
1840 "movd %%mm0, 4" #dst " \n\t" | |
1841 #endif | |
1842 HDF(0,(%0)) | |
1843 HDF(8,(%%ecx)) | |
1844 HDF(16,(%%ecx, %1)) | |
1845 HDF(24,(%%ecx, %1, 2)) | |
1846 HDF(32,(%0, %1, 4)) | |
1847 HDF(40,(%%ebx)) | |
1848 HDF(48,(%%ebx, %1)) | |
1849 HDF(56,(%%ebx, %1, 2)) | |
1850 : | |
1851 : "r" (dst), "r" (stride), "r" (QP) | |
1852 : "%eax", "%ebx", "%ecx" | |
1853 ); | |
1854 #else | |
1855 int y; | |
1856 for(y=0; y<BLOCK_SIZE; y++) | |
1857 { | |
1858 const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]); | |
1859 | |
1860 if(ABS(middleEnergy) < 8*QP) | |
1861 { | |
1862 const int q=(dst[3] - dst[4])/2; | |
1863 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]); | |
1864 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]); | |
1865 | |
1866 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | |
1867 d= MAX(d, 0); | |
1868 | |
1869 d= (5*d + 32) >> 6; | |
1870 d*= SIGN(-middleEnergy); | |
1871 | |
1872 if(q>0) | |
1873 { | |
1874 d= d<0 ? 0 : d; | |
1875 d= d>q ? q : d; | |
1876 } | |
1877 else | |
1878 { | |
1879 d= d>0 ? 0 : d; | |
1880 d= d<q ? q : d; | |
1881 } | |
1882 | |
1883 dst[3]-= d; | |
1884 dst[4]+= d; | |
1885 } | |
1886 dst+= stride; | |
1887 } | |
1888 #endif | |
1889 } | |
1890 | |
/**
 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
 * using the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
 */
static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
{

#if 0
	/* disabled MMX version: reads the lines previously staged in tempBlock
	   and writes the filtered result back through dst */
	asm volatile(
		"leal (%0, %1), %%ecx				\n\t"
		"leal (%%ecx, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	ecx	ecx+%1	ecx+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
		"pxor %%mm7, %%mm7				\n\t"
		"leal tempBlock, %%eax				\n\t"
/*
#define HLP1	"movq (%0), %%mm0			\n\t"\
		"movq %%mm0, %%mm1			\n\t"\
		"psllq $8, %%mm0			\n\t"\
		PAVGB(%%mm1, %%mm0)\
		"psrlw $8, %%mm0			\n\t"\
		"pxor %%mm1, %%mm1			\n\t"\
		"packuswb %%mm1, %%mm0			\n\t"\
		"movq %%mm0, %%mm1			\n\t"\
		"movq %%mm0, %%mm2			\n\t"\
		"psllq $32, %%mm0			\n\t"\
		"paddb %%mm0, %%mm1			\n\t"\
		"psllq $16, %%mm2			\n\t"\
		PAVGB(%%mm2, %%mm0)\
		"movq %%mm0, %%mm3			\n\t"\
		"pand bm11001100, %%mm0			\n\t"\
		"paddusb %%mm0, %%mm3			\n\t"\
		"psrlq $8, %%mm3			\n\t"\
		PAVGB(%%mm1, %%mm4)\
		PAVGB(%%mm3, %%mm2)\
		"psrlq $16, %%mm2			\n\t"\
		"punpcklbw %%mm2, %%mm2			\n\t"\
		"movq %%mm2, (%0)			\n\t"\

#define HLP2	"movq (%0), %%mm0			\n\t"\
		"movq %%mm0, %%mm1			\n\t"\
		"psllq $8, %%mm0			\n\t"\
		PAVGB(%%mm1, %%mm0)\
		"psrlw $8, %%mm0			\n\t"\
		"pxor %%mm1, %%mm1			\n\t"\
		"packuswb %%mm1, %%mm0			\n\t"\
		"movq %%mm0, %%mm2			\n\t"\
		"psllq $32, %%mm0			\n\t"\
		"psllq $16, %%mm2			\n\t"\
		PAVGB(%%mm2, %%mm0)\
		"movq %%mm0, %%mm3			\n\t"\
		"pand bm11001100, %%mm0			\n\t"\
		"paddusb %%mm0, %%mm3			\n\t"\
		"psrlq $8, %%mm3			\n\t"\
		PAVGB(%%mm3, %%mm2)\
		"psrlq $16, %%mm2			\n\t"\
		"punpcklbw %%mm2, %%mm2			\n\t"\
		"movq %%mm2, (%0)			\n\t"\
*/
// approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
/*
Implemented	Exact 7-Tap
 9421		A321
 36421		64321
 334321		=
 1234321	=
  1234321	=
   123433	=
    12463	12346
     1249	 123A

*/

#ifdef HAVE_MMX2
#define HLP3(i)	"movq " #i "(%%eax), %%mm0		\n\t"\
		"movq %%mm0, %%mm1			\n\t"\
		"movq %%mm0, %%mm2			\n\t"\
		"movq %%mm0, %%mm3			\n\t"\
		"movq %%mm0, %%mm4			\n\t"\
		"psllq $8, %%mm1			\n\t"\
		"psrlq $8, %%mm2			\n\t"\
		"pand bm00000001, %%mm3			\n\t"\
		"pand bm10000000, %%mm4			\n\t"\
		"por %%mm3, %%mm1			\n\t"\
		"por %%mm4, %%mm2			\n\t"\
		PAVGB(%%mm2, %%mm1)\
		PAVGB(%%mm1, %%mm0)\
\
		"pshufw $0xF9, %%mm0, %%mm3		\n\t"\
		"pshufw $0x90, %%mm0, %%mm4		\n\t"\
		PAVGB(%%mm3, %%mm4)\
		PAVGB(%%mm4, %%mm0)\
		"movd %%mm0, (%0)			\n\t"\
		"psrlq $32, %%mm0			\n\t"\
		"movd %%mm0, 4(%0)			\n\t"
#else
#define HLP3(i)	"movq " #i "(%%eax), %%mm0		\n\t"\
		"movq %%mm0, %%mm1			\n\t"\
		"movq %%mm0, %%mm2			\n\t"\
		"movd -4(%0), %%mm3			\n\t" /*0001000*/\
		"movd 8(%0), %%mm4			\n\t" /*0001000*/\
		"psllq $8, %%mm1			\n\t"\
		"psrlq $8, %%mm2			\n\t"\
		"psrlq $24, %%mm3			\n\t"\
		"psllq $56, %%mm4			\n\t"\
		"por %%mm3, %%mm1			\n\t"\
		"por %%mm4, %%mm2			\n\t"\
		PAVGB(%%mm2, %%mm1)\
		PAVGB(%%mm1, %%mm0)\
\
		"movq %%mm0, %%mm3			\n\t"\
		"movq %%mm0, %%mm4			\n\t"\
		"movq %%mm0, %%mm5			\n\t"\
		"psrlq $16, %%mm3			\n\t"\
		"psllq $16, %%mm4			\n\t"\
		"pand bm11000000, %%mm5			\n\t"\
		"por %%mm5, %%mm3			\n\t"\
		"movq %%mm0, %%mm5			\n\t"\
		"pand bm00000011, %%mm5			\n\t"\
		"por %%mm5, %%mm4			\n\t"\
		PAVGB(%%mm3, %%mm4)\
		PAVGB(%%mm4, %%mm0)\
		"movd %%mm0, (%0)			\n\t"\
		"psrlq $32, %%mm0			\n\t"\
		"movd %%mm0, 4(%0)			\n\t"
#endif

/* uses the 7-Tap Filter: 1112111 */
#define NEW_HLP(src, dst)\
		"movq " #src "(%%eax), %%mm1		\n\t"\
		"movq " #src "(%%eax), %%mm2		\n\t"\
		"psllq $8, %%mm1			\n\t"\
		"psrlq $8, %%mm2			\n\t"\
		"movd -4" #dst ", %%mm3			\n\t" /*0001000*/\
		"movd 8" #dst ", %%mm4			\n\t" /*0001000*/\
		"psrlq $24, %%mm3			\n\t"\
		"psllq $56, %%mm4			\n\t"\
		"por %%mm3, %%mm1			\n\t"\
		"por %%mm4, %%mm2			\n\t"\
		"movq %%mm1, %%mm5			\n\t"\
		PAVGB(%%mm2, %%mm1)\
		"movq " #src "(%%eax), %%mm0		\n\t"\
		PAVGB(%%mm1, %%mm0)\
		"psllq $8, %%mm5			\n\t"\
		"psrlq $8, %%mm2			\n\t"\
		"por %%mm3, %%mm5			\n\t"\
		"por %%mm4, %%mm2			\n\t"\
		"movq %%mm5, %%mm1			\n\t"\
		PAVGB(%%mm2, %%mm5)\
		"psllq $8, %%mm1			\n\t"\
		"psrlq $8, %%mm2			\n\t"\
		"por %%mm3, %%mm1			\n\t"\
		"por %%mm4, %%mm2			\n\t"\
		PAVGB(%%mm2, %%mm1)\
		PAVGB(%%mm1, %%mm5)\
		PAVGB(%%mm5, %%mm0)\
		"movd %%mm0, " #dst "			\n\t"\
		"psrlq $32, %%mm0			\n\t"\
		"movd %%mm0, 4" #dst "			\n\t"

/* uses the 9-Tap Filter: 112242211 */
#define NEW_HLP2(i)\
		"movq " #i "(%%eax), %%mm0		\n\t" /*0001000*/\
		"movq %%mm0, %%mm1			\n\t" /*0001000*/\
		"movq %%mm0, %%mm2			\n\t" /*0001000*/\
		"movd -4(%0), %%mm3			\n\t" /*0001000*/\
		"movd 8(%0), %%mm4			\n\t" /*0001000*/\
		"psllq $8, %%mm1			\n\t"\
		"psrlq $8, %%mm2			\n\t"\
		"psrlq $24, %%mm3			\n\t"\
		"psllq $56, %%mm4			\n\t"\
		"por %%mm3, %%mm1			\n\t" /*0010000*/\
		"por %%mm4, %%mm2			\n\t" /*0000100*/\
		"movq %%mm1, %%mm5			\n\t" /*0010000*/\
		PAVGB(%%mm2, %%mm1)			      /*0010100*/\
		PAVGB(%%mm1, %%mm0)			      /*0012100*/\
		"psllq $8, %%mm5			\n\t"\
		"psrlq $8, %%mm2			\n\t"\
		"por %%mm3, %%mm5			\n\t" /*0100000*/\
		"por %%mm4, %%mm2			\n\t" /*0000010*/\
		"movq %%mm5, %%mm1			\n\t" /*0100000*/\
		PAVGB(%%mm2, %%mm5)			      /*0100010*/\
		"psllq $8, %%mm1			\n\t"\
		"psrlq $8, %%mm2			\n\t"\
		"por %%mm3, %%mm1			\n\t" /*1000000*/\
		"por %%mm4, %%mm2			\n\t" /*0000001*/\
		"movq %%mm1, %%mm6			\n\t" /*1000000*/\
		PAVGB(%%mm2, %%mm1)			      /*1000001*/\
		"psllq $8, %%mm6			\n\t"\
		"psrlq $8, %%mm2			\n\t"\
		"por %%mm3, %%mm6			\n\t"/*100000000*/\
		"por %%mm4, %%mm2			\n\t"/*000000001*/\
		PAVGB(%%mm2, %%mm6)			     /*100000001*/\
		PAVGB(%%mm6, %%mm1)			     /*110000011*/\
		PAVGB(%%mm1, %%mm5)			     /*112000211*/\
		PAVGB(%%mm5, %%mm0)			     /*112242211*/\
		"movd %%mm0, (%0)			\n\t"\
		"psrlq $32, %%mm0			\n\t"\
		"movd %%mm0, 4(%0)			\n\t"

#define HLP(src, dst) NEW_HLP(src, dst)

		HLP(0, (%0))
		HLP(8, (%%ecx))
		HLP(16, (%%ecx, %1))
		HLP(24, (%%ecx, %1, 2))
		HLP(32, (%0, %1, 4))
		HLP(40, (%%ebx))
		HLP(48, (%%ebx, %1))
		HLP(56, (%%ebx, %1, 2))

		:
		: "r" (dst), "r" (stride)
		: "%eax", "%ebx", "%ecx"
	);

#else
	int y;
	for(y=0; y<BLOCK_SIZE; y++)
	{
		/* border handling: use the outside pixel only when it is close
		   (difference < QP) to the edge pixel, otherwise repeat the edge */
		const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
		const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];

		/* sums[i] = pairwise sums of adjacent pixels; combined below to
		   realize the (1,1,2,2,4,2,2,1,1)/16 tap weights per output pixel */
		int sums[9];
		sums[0] = first + dst[0];
		sums[1] = dst[0] + dst[1];
		sums[2] = dst[1] + dst[2];
		sums[3] = dst[2] + dst[3];
		sums[4] = dst[3] + dst[4];
		sums[5] = dst[4] + dst[5];
		sums[6] = dst[5] + dst[6];
		sums[7] = dst[6] + dst[7];
		sums[8] = dst[7] + last;

		/* +8 before >>4 rounds to nearest instead of truncating */
		dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
		dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
		dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
		dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
		dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
		dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
		dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
		dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;

		dst+= stride;
	}
#endif
}
2139 | 1440 |
2140 static inline void dering(uint8_t src[], int stride, int QP) | 1441 static inline void dering(uint8_t src[], int stride, int QP) |
3531 | 2832 |
3532 horizontal_size >>= 1; | 2833 horizontal_size >>= 1; |
3533 vertical_size >>= 1; | 2834 vertical_size >>= 1; |
3534 src_stride >>= 1; | 2835 src_stride >>= 1; |
3535 dst_stride >>= 1; | 2836 dst_stride >>= 1; |
3536 // mode&= ~(LINEAR_IPOL_DEINT_FILTER | LINEAR_BLEND_DEINT_FILTER | | |
3537 // MEDIAN_DEINT_FILTER | CUBIC_IPOL_DEINT_FILTER); | |
3538 | 2837 |
3539 if(1) | 2838 if(1) |
3540 { | 2839 { |
3541 postProcess(src[1], src_stride, dst[1], dst_stride, | 2840 postProcess(src[1], src_stride, dst[1], dst_stride, |
3542 horizontal_size, vertical_size, QP_store, QP_stride, 1, &ppMode); | 2841 horizontal_size, vertical_size, QP_store, QP_stride, 1, &ppMode); |
3636 * Copies a block from src to dst and fixes the blacklevel | 2935 * Copies a block from src to dst and fixes the blacklevel |
3637 * numLines must be a multiple of 4 | 2936 * numLines must be a multiple of 4 |
3638 * levelFix == 0 -> dont touch the brighness & contrast | 2937 * levelFix == 0 -> dont touch the brighness & contrast |
3639 */ | 2938 */ |
3640 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, | 2939 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, |
3641 int numLines, int levelFix) | 2940 int levelFix) |
3642 { | 2941 { |
3643 #ifndef HAVE_MMX | 2942 #ifndef HAVE_MMX |
3644 int i; | 2943 int i; |
3645 #endif | 2944 #endif |
3646 if(levelFix) | 2945 if(levelFix) |
3693 :"r" (srcStride), | 2992 :"r" (srcStride), |
3694 "r" (dstStride) | 2993 "r" (dstStride) |
3695 : "%eax", "%ebx" | 2994 : "%eax", "%ebx" |
3696 ); | 2995 ); |
3697 #else | 2996 #else |
3698 for(i=0; i<numLines; i++) | 2997 for(i=0; i<8; i++) |
3699 memcpy( &(dst[dstStride*i]), | 2998 memcpy( &(dst[dstStride*i]), |
3700 &(src[srcStride*i]), BLOCK_SIZE); | 2999 &(src[srcStride*i]), BLOCK_SIZE); |
3701 #endif | 3000 #endif |
3702 } | 3001 } |
3703 else | 3002 else |
3704 { | 3003 { |
3705 #ifdef HAVE_MMX | 3004 #ifdef HAVE_MMX |
3706 asm volatile( | 3005 asm volatile( |
3707 "movl %4, %%eax \n\t" | |
3708 "movl %%eax, temp0\n\t" | |
3709 "pushl %0 \n\t" | 3006 "pushl %0 \n\t" |
3710 "pushl %1 \n\t" | 3007 "pushl %1 \n\t" |
3711 "leal (%2,%2), %%eax \n\t" | 3008 "leal (%2,%2), %%eax \n\t" |
3712 "leal (%3,%3), %%ebx \n\t" | 3009 "leal (%3,%3), %%ebx \n\t" |
3713 "movq packedYOffset, %%mm2 \n\t" | |
3714 "movq packedYScale, %%mm3 \n\t" | |
3715 | 3010 |
3716 #define SIMPLE_CPY \ | 3011 #define SIMPLE_CPY \ |
3717 "movq (%0), %%mm0 \n\t"\ | 3012 "movq (%0), %%mm0 \n\t"\ |
3718 "movq (%0,%2), %%mm1 \n\t"\ | 3013 "movq (%0,%2), %%mm1 \n\t"\ |
3719 "movq %%mm0, (%1) \n\t"\ | 3014 "movq %%mm0, (%1) \n\t"\ |
3720 "movq %%mm1, (%1, %3) \n\t"\ | 3015 "movq %%mm1, (%1, %3) \n\t"\ |
3721 | 3016 |
3722 "1: \n\t" | |
3723 SIMPLE_CPY | 3017 SIMPLE_CPY |
3724 "addl %%eax, %0 \n\t" | 3018 "addl %%eax, %0 \n\t" |
3725 "addl %%ebx, %1 \n\t" | 3019 "addl %%ebx, %1 \n\t" |
3726 SIMPLE_CPY | 3020 SIMPLE_CPY |
3727 "addl %%eax, %0 \n\t" | 3021 "addl %%eax, %0 \n\t" |
3728 "addl %%ebx, %1 \n\t" | 3022 "addl %%ebx, %1 \n\t" |
3729 "decl temp0 \n\t" | 3023 SIMPLE_CPY |
3730 "jnz 1b \n\t" | 3024 "addl %%eax, %0 \n\t" |
3025 "addl %%ebx, %1 \n\t" | |
3026 SIMPLE_CPY | |
3731 | 3027 |
3732 "popl %1 \n\t" | 3028 "popl %1 \n\t" |
3733 "popl %0 \n\t" | 3029 "popl %0 \n\t" |
3734 : : "r" (src), | 3030 : : "r" (src), |
3735 "r" (dst), | 3031 "r" (dst), |
3736 "r" (srcStride), | 3032 "r" (srcStride), |
3737 "r" (dstStride), | 3033 "r" (dstStride) |
3738 "m" (numLines>>2) | |
3739 : "%eax", "%ebx" | 3034 : "%eax", "%ebx" |
3740 ); | 3035 ); |
3741 #else | 3036 #else |
3742 for(i=0; i<numLines; i++) | 3037 for(i=0; i<8; i++) |
3743 memcpy( &(dst[dstStride*i]), | 3038 memcpy( &(dst[dstStride*i]), |
3744 &(src[srcStride*i]), BLOCK_SIZE); | 3039 &(src[srcStride*i]), BLOCK_SIZE); |
3745 #endif | 3040 #endif |
3746 } | 3041 } |
3747 } | 3042 } |
3772 | 3067 |
3773 /* Temporal noise reducing buffers */ | 3068 /* Temporal noise reducing buffers */ |
3774 static uint8_t *tempBlured[3]= {NULL,NULL,NULL}; | 3069 static uint8_t *tempBlured[3]= {NULL,NULL,NULL}; |
3775 static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL}; | 3070 static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL}; |
3776 | 3071 |
3072 int copyAhead; | |
3073 | |
3777 #ifdef PP_FUNNY_STRIDE | 3074 #ifdef PP_FUNNY_STRIDE |
3778 uint8_t *dstBlockPtrBackup; | 3075 uint8_t *dstBlockPtrBackup; |
3779 uint8_t *srcBlockPtrBackup; | 3076 uint8_t *srcBlockPtrBackup; |
3780 #endif | 3077 #endif |
3781 | 3078 |
3790 #ifdef HAVE_MMX | 3087 #ifdef HAVE_MMX |
3791 maxTmpNoise[0]= ppMode->maxTmpNoise[0]; | 3088 maxTmpNoise[0]= ppMode->maxTmpNoise[0]; |
3792 maxTmpNoise[1]= ppMode->maxTmpNoise[1]; | 3089 maxTmpNoise[1]= ppMode->maxTmpNoise[1]; |
3793 maxTmpNoise[2]= ppMode->maxTmpNoise[2]; | 3090 maxTmpNoise[2]= ppMode->maxTmpNoise[2]; |
3794 #endif | 3091 #endif |
3092 | |
3093 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; | |
3094 else if(mode & LINEAR_BLEND_DEINT_FILTER) copyAhead=14; | |
3095 else if( (mode & V_DEBLOCK) | |
3096 || (mode & LINEAR_IPOL_DEINT_FILTER) | |
3097 || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13; | |
3098 else if(mode & V_X1_FILTER) copyAhead=11; | |
3099 else if(mode & V_RK1_FILTER) copyAhead=10; | |
3100 else if(mode & DERING) copyAhead=9; | |
3101 else copyAhead=8; | |
3102 | |
3103 copyAhead-= 8; | |
3795 | 3104 |
3796 if(tempDst==NULL) | 3105 if(tempDst==NULL) |
3797 { | 3106 { |
3798 tempDst= (uint8_t*)memalign(8, 1024*24); | 3107 tempDst= (uint8_t*)memalign(8, 1024*24); |
3799 tempSrc= (uint8_t*)memalign(8, 1024*24); | 3108 tempSrc= (uint8_t*)memalign(8, 1024*24); |
3896 for(x=0; x<width; x+=BLOCK_SIZE) | 3205 for(x=0; x<width; x+=BLOCK_SIZE) |
3897 { | 3206 { |
3898 | 3207 |
3899 #ifdef HAVE_MMX2 | 3208 #ifdef HAVE_MMX2 |
3900 /* | 3209 /* |
3901 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | |
3902 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
3903 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
3904 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
3905 */ | |
3906 /* | |
3907 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | 3210 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); |
3908 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | 3211 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); |
3909 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | 3212 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); |
3910 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | 3213 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); |
3911 */ | 3214 */ |
3912 | 3215 |
3913 asm( | 3216 asm( |
3914 "movl %4, %%eax \n\t" | 3217 "movl %4, %%eax \n\t" |
3915 "shrl $2, %%eax \n\t" | 3218 "shrl $2, %%eax \n\t" |
3916 "andl $6, %%eax \n\t" | 3219 "andl $6, %%eax \n\t" |
3917 "addl $8, %%eax \n\t" | 3220 "addl %5, %%eax \n\t" |
3918 "movl %%eax, %%ebx \n\t" | 3221 "movl %%eax, %%ebx \n\t" |
3919 "imul %1, %%eax \n\t" | 3222 "imul %1, %%eax \n\t" |
3920 "imul %3, %%ebx \n\t" | 3223 "imul %3, %%ebx \n\t" |
3921 "prefetchnta 32(%%eax, %0) \n\t" | 3224 "prefetchnta 32(%%eax, %0) \n\t" |
3922 "prefetcht0 32(%%ebx, %2) \n\t" | 3225 "prefetcht0 32(%%ebx, %2) \n\t" |
3923 "addl %1, %%eax \n\t" | 3226 "addl %1, %%eax \n\t" |
3924 "addl %3, %%ebx \n\t" | 3227 "addl %3, %%ebx \n\t" |
3925 "prefetchnta 32(%%eax, %0) \n\t" | 3228 "prefetchnta 32(%%eax, %0) \n\t" |
3926 "prefetcht0 32(%%ebx, %2) \n\t" | 3229 "prefetcht0 32(%%ebx, %2) \n\t" |
3927 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), | 3230 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), |
3928 "m" (x) | 3231 "m" (x), "m" (copyAhead) |
3929 : "%eax", "%ebx" | 3232 : "%eax", "%ebx" |
3930 ); | 3233 ); |
3931 | 3234 |
3932 #elif defined(HAVE_3DNOW) | 3235 #elif defined(HAVE_3DNOW) |
3933 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | 3236 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... |
3936 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | 3239 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
3937 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | 3240 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
3938 */ | 3241 */ |
3939 #endif | 3242 #endif |
3940 | 3243 |
3941 blockCopy(dstBlock + dstStride*8, dstStride, | 3244 blockCopy(dstBlock + dstStride*copyAhead, dstStride, |
3942 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX); | 3245 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX); |
3943 | 3246 |
3944 if(mode & LINEAR_IPOL_DEINT_FILTER) | 3247 if(mode & LINEAR_IPOL_DEINT_FILTER) |
3945 deInterlaceInterpolateLinear(dstBlock, dstStride); | 3248 deInterlaceInterpolateLinear(dstBlock, dstStride); |
3946 else if(mode & LINEAR_BLEND_DEINT_FILTER) | 3249 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
3947 deInterlaceBlendLinear(dstBlock, dstStride); | 3250 deInterlaceBlendLinear(dstBlock, dstStride); |
3953 deInterlaceBlendCubic(dstBlock, dstStride); | 3256 deInterlaceBlendCubic(dstBlock, dstStride); |
3954 */ | 3257 */ |
3955 dstBlock+=8; | 3258 dstBlock+=8; |
3956 srcBlock+=8; | 3259 srcBlock+=8; |
3957 } | 3260 } |
3958 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, 8*dstStride ); | 3261 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, copyAhead*dstStride ); |
3959 } | 3262 } |
3960 | 3263 |
3961 for(y=0; y<height; y+=BLOCK_SIZE) | 3264 for(y=0; y<height; y+=BLOCK_SIZE) |
3962 { | 3265 { |
3963 //1% speedup if these are here instead of the inner loop | 3266 //1% speedup if these are here instead of the inner loop |
3974 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards | 3277 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards |
3975 if not than use a temporary buffer */ | 3278 if not than use a temporary buffer */ |
3976 if(y+15 >= height) | 3279 if(y+15 >= height) |
3977 { | 3280 { |
3978 int i; | 3281 int i; |
3979 /* copy from line 8 to 15 of src, these will be copied with | 3282 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with |
3980 blockcopy to dst later */ | 3283 blockcopy to dst later */ |
3981 memcpy(tempSrc + srcStride*8, srcBlock + srcStride*8, | 3284 memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead, |
3982 srcStride*MAX(height-y-8, 0) ); | 3285 srcStride*MAX(height-y-copyAhead, 0) ); |
3983 | 3286 |
3984 /* duplicate last line of src to fill the void upto line 15 */ | 3287 /* duplicate last line of src to fill the void upto line (copyAhead+7) */ |
3985 for(i=MAX(height-y, 8); i<=15; i++) | 3288 for(i=MAX(height-y, 8); i<copyAhead+8; i++) |
3986 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride); | 3289 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride); |
3987 | 3290 |
3988 /* copy up to 9 lines of dst (line -1 to 7)*/ | 3291 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/ |
3989 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, 9) ); | 3292 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) ); |
3990 | 3293 |
3991 /* duplicate last line of dst to fill the void upto line 8 */ | 3294 /* duplicate last line of dst to fill the void upto line (copyAhead) */ |
3992 for(i=height-y+1; i<=8; i++) | 3295 for(i=height-y+1; i<=copyAhead; i++) |
3993 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride); | 3296 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride); |
3994 | 3297 |
3995 dstBlock= tempDst + dstStride; | 3298 dstBlock= tempDst + dstStride; |
3996 srcBlock= tempSrc; | 3299 srcBlock= tempSrc; |
3997 } | 3300 } |
4039 T0= rdtsc(); | 3342 T0= rdtsc(); |
4040 #endif | 3343 #endif |
4041 | 3344 |
4042 #ifdef HAVE_MMX2 | 3345 #ifdef HAVE_MMX2 |
4043 /* | 3346 /* |
4044 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | |
4045 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
4046 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
4047 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
4048 */ | |
4049 /* | |
4050 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | 3347 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); |
4051 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | 3348 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); |
4052 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | 3349 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); |
4053 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | 3350 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); |
4054 */ | 3351 */ |
4055 | 3352 |
4056 asm( | 3353 asm( |
4057 "movl %4, %%eax \n\t" | 3354 "movl %4, %%eax \n\t" |
4058 "shrl $2, %%eax \n\t" | 3355 "shrl $2, %%eax \n\t" |
4059 "andl $6, %%eax \n\t" | 3356 "andl $6, %%eax \n\t" |
4060 "addl $8, %%eax \n\t" | 3357 "addl %5, %%eax \n\t" |
4061 "movl %%eax, %%ebx \n\t" | 3358 "movl %%eax, %%ebx \n\t" |
4062 "imul %1, %%eax \n\t" | 3359 "imul %1, %%eax \n\t" |
4063 "imul %3, %%ebx \n\t" | 3360 "imul %3, %%ebx \n\t" |
4064 "prefetchnta 32(%%eax, %0) \n\t" | 3361 "prefetchnta 32(%%eax, %0) \n\t" |
4065 "prefetcht0 32(%%ebx, %2) \n\t" | 3362 "prefetcht0 32(%%ebx, %2) \n\t" |
4066 "addl %1, %%eax \n\t" | 3363 "addl %1, %%eax \n\t" |
4067 "addl %3, %%ebx \n\t" | 3364 "addl %3, %%ebx \n\t" |
4068 "prefetchnta 32(%%eax, %0) \n\t" | 3365 "prefetchnta 32(%%eax, %0) \n\t" |
4069 "prefetcht0 32(%%ebx, %2) \n\t" | 3366 "prefetcht0 32(%%ebx, %2) \n\t" |
4070 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), | 3367 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), |
4071 "m" (x) | 3368 "m" (x), "m" (copyAhead) |
4072 : "%eax", "%ebx" | 3369 : "%eax", "%ebx" |
4073 ); | 3370 ); |
4074 | 3371 |
4075 #elif defined(HAVE_3DNOW) | 3372 #elif defined(HAVE_3DNOW) |
4076 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | 3373 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... |
4098 dstBlock= tempDstBlock; | 3395 dstBlock= tempDstBlock; |
4099 srcBlock= tempSrcBlock; | 3396 srcBlock= tempSrcBlock; |
4100 } | 3397 } |
4101 #endif | 3398 #endif |
4102 | 3399 |
4103 blockCopy(dstBlock + dstStride*8, dstStride, | 3400 blockCopy(dstBlock + dstStride*copyAhead, dstStride, |
4104 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX); | 3401 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX); |
4105 | 3402 |
4106 if(mode & LINEAR_IPOL_DEINT_FILTER) | 3403 if(mode & LINEAR_IPOL_DEINT_FILTER) |
4107 deInterlaceInterpolateLinear(dstBlock, dstStride); | 3404 deInterlaceInterpolateLinear(dstBlock, dstStride); |
4108 else if(mode & LINEAR_BLEND_DEINT_FILTER) | 3405 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
4109 deInterlaceBlendLinear(dstBlock, dstStride); | 3406 deInterlaceBlendLinear(dstBlock, dstStride); |
4158 vertRK1Filter(tempBlock1, 16, QP); | 3455 vertRK1Filter(tempBlock1, 16, QP); |
4159 else if(mode & H_X1_FILTER) | 3456 else if(mode & H_X1_FILTER) |
4160 vertX1Filter(tempBlock1, 16, QP); | 3457 vertX1Filter(tempBlock1, 16, QP); |
4161 else if(mode & H_DEBLOCK) | 3458 else if(mode & H_DEBLOCK) |
4162 { | 3459 { |
4163 if( isVertDC(tempBlock1, 16)) | 3460 if( isVertDC(tempBlock1, 16) ) |
4164 { | 3461 { |
4165 if(isVertMinMaxOk(tempBlock1, 16, QP)) | 3462 if(isVertMinMaxOk(tempBlock1, 16, QP)) |
4166 doVertLowPass(tempBlock1, 16, QP); | 3463 doVertLowPass(tempBlock1, 16, QP); |
4167 } | 3464 } |
4168 else | 3465 else |
4250 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) ); | 3547 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) ); |
4251 } | 3548 } |
4252 /* | 3549 /* |
4253 for(x=0; x<width; x+=32) | 3550 for(x=0; x<width; x+=32) |
4254 { | 3551 { |
4255 int i; | 3552 volatile int i; |
4256 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] | 3553 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] |
4257 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] | 3554 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] |
4258 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride] | 3555 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; |
4259 + dstBlock[x +13*dstStride] + dstBlock[x +14*dstStride] | 3556 // + dstBlock[x +13*dstStride] |
4260 + dstBlock[x +15*dstStride]; | 3557 // + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; |
4261 } | 3558 }*/ |
4262 */ } | 3559 } |
4263 #ifdef HAVE_3DNOW | 3560 #ifdef HAVE_3DNOW |
4264 asm volatile("femms"); | 3561 asm volatile("femms"); |
4265 #elif defined (HAVE_MMX) | 3562 #elif defined (HAVE_MMX) |
4266 asm volatile("emms"); | 3563 asm volatile("emms"); |
4267 #endif | 3564 #endif |