comparison postproc/postprocess.c @ 3031:86e1a0f4f0bc

cleanup, precopy fewer lines from src to dst if possible, speedup (due to cleanup of blockCopy)
author michael
date Tue, 20 Nov 2001 17:47:52 +0000
parents 71384f064a3e
children 6de073cf52b5
comparison
equal deleted inserted replaced
3030:1dbc569b6528 3031:86e1a0f4f0bc
60 split this huge file 60 split this huge file
61 border remover 61 border remover
62 optimize c versions 62 optimize c versions
63 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks 63 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
64 smart blur 64 smart blur
65 commandline option for the deblock thresholds
65 ... 66 ...
66 */ 67 */
67 68
68 //Changelog: use the CVS log 69 //Changelog: use the CVS log
69 70
856 } 857 }
857 */ 858 */
858 #endif 859 #endif
859 } 860 }
860 861
861 /**
862 * Experimental Filter 1 (Horizontal)
863 * will not damage linear gradients
864 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
865 * can only smooth blocks at the expected locations (it can't smooth them if they moved)
866 * the MMX2 version does correct clipping, the C version doesn't
867 * not identical with the vertical one
868 */
869 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
870 {
871 int y;
872 static uint64_t *lut= NULL;
873 if(lut==NULL)
874 {
875 int i;
876 lut= (uint64_t*)memalign(8, 256*8);
877 for(i=0; i<256; i++)
878 {
879 int v= i < 128 ? 2*i : 2*(i-256);
880 /*
881 //Simulate 112242211 9-Tap filter
882 uint64_t a= (v/16) & 0xFF;
883 uint64_t b= (v/8) & 0xFF;
884 uint64_t c= (v/4) & 0xFF;
885 uint64_t d= (3*v/8) & 0xFF;
886 */
887 //Simulate piecewise linear interpolation
888 uint64_t a= (v/16) & 0xFF;
889 uint64_t b= (v*3/16) & 0xFF;
890 uint64_t c= (v*5/16) & 0xFF;
891 uint64_t d= (7*v/16) & 0xFF;
892 uint64_t A= (0x100 - a)&0xFF;
893 uint64_t B= (0x100 - b)&0xFF;
894 uint64_t C= (0x100 - c)&0xFF;
895 uint64_t D= (0x100 - d)&0xFF;
896
897 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
898 (D<<24) | (C<<16) | (B<<8) | (A);
899 //lut[i] = (v<<32) | (v<<24);
900 }
901 }
902
903 #if 0
904 asm volatile(
905 "pxor %%mm7, %%mm7 \n\t" // 0
906 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
907 "leal (%0, %1), %%eax \n\t"
908 "leal (%%eax, %1, 4), %%ebx \n\t"
909
910 "movq b80, %%mm6 \n\t"
911 "movd pQPb, %%mm5 \n\t" // QP
912 "movq %%mm5, %%mm4 \n\t"
913 "paddusb %%mm5, %%mm5 \n\t" // 2QP
914 "paddusb %%mm5, %%mm4 \n\t" // 3QP
915 "pxor %%mm5, %%mm5 \n\t" // 0
916 "psubb %%mm4, %%mm5 \n\t" // -3QP
917 "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP
918 "psllq $24, %%mm5 \n\t"
919
920 // 0 1 2 3 4 5 6 7 8 9
921 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
922
923 #define HX1old(a) \
924 "movd " #a ", %%mm0 \n\t"\
925 "movd 4" #a ", %%mm1 \n\t"\
926 "punpckldq %%mm1, %%mm0 \n\t"\
927 "movq %%mm0, %%mm1 \n\t"\
928 "movq %%mm0, %%mm2 \n\t"\
929 "psrlq $8, %%mm1 \n\t"\
930 "psubusb %%mm1, %%mm2 \n\t"\
931 "psubusb %%mm0, %%mm1 \n\t"\
932 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
933 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
934 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
935 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
936 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
937 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
938 "paddb %%mm5, %%mm1 \n\t"\
939 "psubusb %%mm5, %%mm1 \n\t"\
940 PAVGB(%%mm7, %%mm1)\
941 "pxor %%mm2, %%mm1 \n\t"\
942 "psubb %%mm2, %%mm1 \n\t"\
943 "psrlq $24, %%mm1 \n\t"\
944 "movd %%mm1, %%ecx \n\t"\
945 "paddb %%mm6, %%mm0 \n\t"\
946 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
947 "paddb %%mm6, %%mm0 \n\t"\
948 "movq %%mm0, " #a " \n\t"\
949
950 /*
951 HX1old((%0))
952 HX1old((%%eax))
953 HX1old((%%eax, %1))
954 HX1old((%%eax, %1, 2))
955 HX1old((%0, %1, 4))
956 HX1old((%%ebx))
957 HX1old((%%ebx, %1))
958 HX1old((%%ebx, %1, 2))
959 */
960
961 //FIXME add some comments, it's unreadable ...
962 #define HX1b(a, c, b, d) \
963 "movd " #a ", %%mm0 \n\t"\
964 "movd 4" #a ", %%mm1 \n\t"\
965 "punpckldq %%mm1, %%mm0 \n\t"\
966 "movd " #b ", %%mm4 \n\t"\
967 "movq %%mm0, %%mm1 \n\t"\
968 "movq %%mm0, %%mm2 \n\t"\
969 "psrlq $8, %%mm1 \n\t"\
970 "movd 4" #b ", %%mm3 \n\t"\
971 "psubusb %%mm1, %%mm2 \n\t"\
972 "psubusb %%mm0, %%mm1 \n\t"\
973 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
974 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
975 "punpckldq %%mm3, %%mm4 \n\t"\
976 "movq %%mm1, %%mm3 \n\t"\
977 "psllq $32, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
978 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
979 "paddb %%mm6, %%mm0 \n\t"\
980 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
981 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
982 "movq %%mm4, %%mm3 \n\t"\
983 "paddb %%mm5, %%mm1 \n\t"\
984 "psubusb %%mm5, %%mm1 \n\t"\
985 "psrlq $8, %%mm3 \n\t"\
986 PAVGB(%%mm7, %%mm1)\
987 "pxor %%mm2, %%mm1 \n\t"\
988 "psubb %%mm2, %%mm1 \n\t"\
989 "movq %%mm4, %%mm2 \n\t"\
990 "psrlq $24, %%mm1 \n\t"\
991 "psubusb %%mm3, %%mm2 \n\t"\
992 "movd %%mm1, %%ecx \n\t"\
993 "psubusb %%mm4, %%mm3 \n\t"\
994 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
995 "por %%mm2, %%mm3 \n\t" /* p´x = |px - p(x+1)| */\
996 "paddb %%mm6, %%mm0 \n\t"\
997 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
998 "movq %%mm3, %%mm1 \n\t"\
999 "psllq $32, %%mm1 \n\t" /* p´5 = |p1 - p2| */\
1000 "movq %%mm0, " #a " \n\t"\
1001 PAVGB(%%mm3, %%mm1) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
1002 "paddb %%mm6, %%mm4 \n\t"\
1003 "psrlq $16, %%mm1 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
1004 "psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
1005 "paddb %%mm5, %%mm3 \n\t"\
1006 "psubusb %%mm5, %%mm3 \n\t"\
1007 PAVGB(%%mm7, %%mm3)\
1008 "pxor %%mm2, %%mm3 \n\t"\
1009 "psubb %%mm2, %%mm3 \n\t"\
1010 "psrlq $24, %%mm3 \n\t"\
1011 "movd " #c ", %%mm0 \n\t"\
1012 "movd 4" #c ", %%mm1 \n\t"\
1013 "punpckldq %%mm1, %%mm0 \n\t"\
1014 "paddb %%mm6, %%mm0 \n\t"\
1015 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
1016 "paddb %%mm6, %%mm0 \n\t"\
1017 "movq %%mm0, " #c " \n\t"\
1018 "movd %%mm3, %%ecx \n\t"\
1019 "movd " #d ", %%mm0 \n\t"\
1020 "paddsb (%2, %%ecx, 8), %%mm4 \n\t"\
1021 "movd 4" #d ", %%mm1 \n\t"\
1022 "paddb %%mm6, %%mm4 \n\t"\
1023 "punpckldq %%mm1, %%mm0 \n\t"\
1024 "movq %%mm4, " #b " \n\t"\
1025 "paddb %%mm6, %%mm0 \n\t"\
1026 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
1027 "paddb %%mm6, %%mm0 \n\t"\
1028 "movq %%mm0, " #d " \n\t"\
1029
1030 HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
1031 HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
1032
1033
1034 :
1035 : "r" (src), "r" (stride), "r" (lut)
1036 : "%eax", "%ebx", "%ecx"
1037 );
1038 #else
1039
1040 //FIXME (has little in common with the mmx2 version)
1041 for(y=0; y<BLOCK_SIZE; y++)
1042 {
1043 int a= src[1] - src[2];
1044 int b= src[3] - src[4];
1045 int c= src[5] - src[6];
1046
1047 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
1048
1049 if(d < QP)
1050 {
1051 int v = d * SIGN(-b);
1052
1053 src[1] +=v/8;
1054 src[2] +=v/4;
1055 src[3] +=3*v/8;
1056 src[4] -=3*v/8;
1057 src[5] -=v/4;
1058 src[6] -=v/8;
1059
1060 }
1061 src+=stride;
1062 }
1063 #endif
1064 }
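
For reference, a minimal standalone sketch of the per-row logic in the C fallback above (the main() wrapper, the local ABS/MAX/SIGN macros and the sample row are illustrative only): it applies the piecewise-linear correction to a synthetic step edge and prints the smoothed row.

#include <stdio.h>

#define ABS(a)   ((a) > 0 ? (a) : (-(a)))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#define SIGN(a)  ((a) > 0 ? 1 : -1)

int main(void)
{
	unsigned char row[8]= {100,100,100,100, 120,120,120,120};
	int QP= 30;
	int x;

	int a= row[1] - row[2];                      /* left inner difference */
	int b= row[3] - row[4];                      /* difference across the block border */
	int c= row[5] - row[6];                      /* right inner difference */
	int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); /* "excess" energy at the border */

	if(d < QP)                                   /* smooth only if it looks like blocking */
	{
		int v= d * SIGN(-b);                 /* v=20 for this row */
		row[1]+= v/8;   row[2]+= v/4;   row[3]+= 3*v/8;
		row[4]-= 3*v/8; row[5]-= v/4;   row[6]-= v/8;
	}

	for(x=0; x<8; x++) printf("%d ", row[x]);    /* 100 102 105 107 113 115 118 120 */
	printf("\n");
	return 0;
}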
1065
1066
1067 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) 862 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
1068 { 863 {
1069 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 864 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1070 /* 865 /*
1071 uint8_t tmp[16]; 866 uint8_t tmp[16];
1637 1432
1638 src[l4]-= d; 1433 src[l4]-= d;
1639 src[l5]+= d; 1434 src[l5]+= d;
1640 } 1435 }
1641 src++; 1436 src++;
1642 }
1643 #endif
1644 }
1645
1646 //FIXME? |255-0| = 1
1647 /**
1648 * Check if the given 8x8 Block is mostly "flat"
1649 */
1650 static inline int isHorizDC(uint8_t src[], int stride)
1651 {
1652 // src++;
1653 int numEq= 0;
1654 #if 0
1655 asm volatile (
1656 // "int $3 \n\t"
1657 "leal (%1, %2), %%ecx \n\t"
1658 "leal (%%ecx, %2, 4), %%ebx \n\t"
1659 // 0 1 2 3 4 5 6 7 8 9
1660 // %1 ecx ecx+%2 ecx+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2
1661 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
1662 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
1663 "pxor %%mm0, %%mm0 \n\t"
1664 "movl %1, %%eax \n\t"
1665 "andl $0x1F, %%eax \n\t"
1666 "cmpl $24, %%eax \n\t"
1667 "leal tempBlock, %%eax \n\t"
1668 "jb 1f \n\t"
1669
1670 #define HDC_CHECK_AND_CPY(src, dst) \
1671 "movd " #src ", %%mm2 \n\t"\
1672 "punpckldq 4" #src ", %%mm2 \n\t" /* (%1) */\
1673 "movq %%mm2, %%mm1 \n\t"\
1674 "psrlq $8, %%mm2 \n\t"\
1675 "psubb %%mm1, %%mm2 \n\t"\
1676 "paddb %%mm7, %%mm2 \n\t"\
1677 "pcmpgtb %%mm6, %%mm2 \n\t"\
1678 "paddb %%mm2, %%mm0 \n\t"\
1679 "movq %%mm1," #dst "(%%eax) \n\t"
1680
1681 HDC_CHECK_AND_CPY((%1),0)
1682 HDC_CHECK_AND_CPY((%%ecx),8)
1683 HDC_CHECK_AND_CPY((%%ecx, %2),16)
1684 HDC_CHECK_AND_CPY((%%ecx, %2, 2),24)
1685 HDC_CHECK_AND_CPY((%1, %2, 4),32)
1686 HDC_CHECK_AND_CPY((%%ebx),40)
1687 HDC_CHECK_AND_CPY((%%ebx, %2),48)
1688 HDC_CHECK_AND_CPY((%%ebx, %2, 2),56)
1689 "jmp 2f \n\t"
1690 "1: \n\t"
1691 // src does not cross a 32-byte cache line, so don't waste time with alignment
1692 #define HDC_CHECK_AND_CPY2(src, dst) \
1693 "movq " #src ", %%mm2 \n\t"\
1694 "movq " #src ", %%mm1 \n\t"\
1695 "psrlq $8, %%mm2 \n\t"\
1696 "psubb %%mm1, %%mm2 \n\t"\
1697 "paddb %%mm7, %%mm2 \n\t"\
1698 "pcmpgtb %%mm6, %%mm2 \n\t"\
1699 "paddb %%mm2, %%mm0 \n\t"\
1700 "movq %%mm1," #dst "(%%eax) \n\t"
1701
1702 HDC_CHECK_AND_CPY2((%1),0)
1703 HDC_CHECK_AND_CPY2((%%ecx),8)
1704 HDC_CHECK_AND_CPY2((%%ecx, %2),16)
1705 HDC_CHECK_AND_CPY2((%%ecx, %2, 2),24)
1706 HDC_CHECK_AND_CPY2((%1, %2, 4),32)
1707 HDC_CHECK_AND_CPY2((%%ebx),40)
1708 HDC_CHECK_AND_CPY2((%%ebx, %2),48)
1709 HDC_CHECK_AND_CPY2((%%ebx, %2, 2),56)
1710 "2: \n\t"
1711 "psllq $8, %%mm0 \n\t" // remove dummy value
1712 "movq %%mm0, %%mm1 \n\t"
1713 "psrlw $8, %%mm0 \n\t"
1714 "paddb %%mm1, %%mm0 \n\t"
1715 "movq %%mm0, %%mm1 \n\t"
1716 "psrlq $16, %%mm0 \n\t"
1717 "paddb %%mm1, %%mm0 \n\t"
1718 "movq %%mm0, %%mm1 \n\t"
1719 "psrlq $32, %%mm0 \n\t"
1720 "paddb %%mm1, %%mm0 \n\t"
1721 "movd %%mm0, %0 \n\t"
1722 : "=r" (numEq)
1723 : "r" (src), "r" (stride)
1724 : "%eax", "%ebx", "%ecx"
1725 );
1726 // printf("%d\n", numEq);
1727 numEq= (256 - numEq) &0xFF;
1728 #else
1729 int y;
1730 for(y=0; y<BLOCK_SIZE; y++)
1731 {
1732 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1733 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1734 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1735 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1736 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1737 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1738 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
1739 src+= stride;
1740 }
1741 #endif
1742 /* if(abs(numEq - asmEq) > 0)
1743 {
1744 // printf("\nasm:%d c:%d\n", asmEq, numEq);
1745 for(int y=0; y<8; y++)
1746 {
1747 for(int x=0; x<8; x++)
1748 {
1749 printf("%d ", src[x + y*stride]);
1750 }
1751 printf("\n");
1752 }
1753 }
1754 */
1755 // printf("%d\n", numEq);
1756 return numEq > hFlatnessThreshold;
1757 }
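
The ((a - b + 1) & 0xFFFF) < 3 comparisons in the C path above count neighbours that differ by at most one without branching on the sign; a tiny sketch verifying that equivalence for all 8-bit values (the main() harness is illustrative only):

#include <assert.h>
#include <stdlib.h>

int main(void)
{
	int a, b;
	for(a=0; a<256; a++)
		for(b=0; b<256; b++)
			/* adding 1 maps the accepted differences -1..1 onto 0..2;
			   larger positive differences stay >= 3 and negative ones
			   wrap to large values under the 0xFFFF mask */
			assert( (((a - b + 1) & 0xFFFF) < 3) == (abs(a - b) <= 1) );
	return 0;
}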
1758
1759 static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
1760 {
1761 if(abs(src[0] - src[7]) > 2*QP) return 0;
1762
1763 return 1;
1764 }
1765
1766 static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
1767 {
1768 #if 0
1769 asm volatile(
1770 "leal (%0, %1), %%ecx \n\t"
1771 "leal (%%ecx, %1, 4), %%ebx \n\t"
1772 // 0 1 2 3 4 5 6 7 8 9
1773 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1774 "pxor %%mm7, %%mm7 \n\t"
1775 "movq bm00001000, %%mm6 \n\t"
1776 "movd %2, %%mm5 \n\t" // QP
1777 "movq %%mm5, %%mm4 \n\t"
1778 "paddusb %%mm5, %%mm5 \n\t" // 2QP
1779 "paddusb %%mm5, %%mm4 \n\t" // 3QP
1780 "psllq $24, %%mm4 \n\t"
1781 "pxor %%mm5, %%mm5 \n\t" // 0
1782 "psubb %%mm4, %%mm5 \n\t" // -QP
1783 "leal tempBlock, %%eax \n\t"
1784
1785 //FIXME? "unroll by 2" and mix
1786 #ifdef HAVE_MMX2
1787 #define HDF(src, dst) \
1788 "movq " #src "(%%eax), %%mm0 \n\t"\
1789 "movq " #src "(%%eax), %%mm1 \n\t"\
1790 "movq " #src "(%%eax), %%mm2 \n\t"\
1791 "psrlq $8, %%mm1 \n\t"\
1792 "psubusb %%mm1, %%mm2 \n\t"\
1793 "psubusb %%mm0, %%mm1 \n\t"\
1794 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
1795 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
1796 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
1797 "pminub %%mm1, %%mm3 \n\t" /* p´5 = min(|p2-p1|, |p6-p5|)*/\
1798 "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
1799 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1800 "paddb %%mm5, %%mm1 \n\t"\
1801 "psubusb %%mm5, %%mm1 \n\t"\
1802 "psrlw $2, %%mm1 \n\t"\
1803 "pxor %%mm2, %%mm1 \n\t"\
1804 "psubb %%mm2, %%mm1 \n\t"\
1805 "pand %%mm6, %%mm1 \n\t"\
1806 "psubb %%mm1, %%mm0 \n\t"\
1807 "psllq $8, %%mm1 \n\t"\
1808 "paddb %%mm1, %%mm0 \n\t"\
1809 "movd %%mm0, " #dst" \n\t"\
1810 "psrlq $32, %%mm0 \n\t"\
1811 "movd %%mm0, 4" #dst" \n\t"
1812 #else
1813 #define HDF(src, dst)\
1814 "movq " #src "(%%eax), %%mm0 \n\t"\
1815 "movq %%mm0, %%mm1 \n\t"\
1816 "movq %%mm0, %%mm2 \n\t"\
1817 "psrlq $8, %%mm1 \n\t"\
1818 "psubusb %%mm1, %%mm2 \n\t"\
1819 "psubusb %%mm0, %%mm1 \n\t"\
1820 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
1821 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
1822 "movq %%mm1, %%mm3 \n\t"\
1823 "psllq $32, %%mm3 \n\t"\
1824 "movq %%mm3, %%mm4 \n\t"\
1825 "psubusb %%mm1, %%mm4 \n\t"\
1826 "psubb %%mm4, %%mm3 \n\t"\
1827 "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
1828 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1829 "paddb %%mm5, %%mm1 \n\t"\
1830 "psubusb %%mm5, %%mm1 \n\t"\
1831 "psrlw $2, %%mm1 \n\t"\
1832 "pxor %%mm2, %%mm1 \n\t"\
1833 "psubb %%mm2, %%mm1 \n\t"\
1834 "pand %%mm6, %%mm1 \n\t"\
1835 "psubb %%mm1, %%mm0 \n\t"\
1836 "psllq $8, %%mm1 \n\t"\
1837 "paddb %%mm1, %%mm0 \n\t"\
1838 "movd %%mm0, " #dst " \n\t"\
1839 "psrlq $32, %%mm0 \n\t"\
1840 "movd %%mm0, 4" #dst " \n\t"
1841 #endif
1842 HDF(0,(%0))
1843 HDF(8,(%%ecx))
1844 HDF(16,(%%ecx, %1))
1845 HDF(24,(%%ecx, %1, 2))
1846 HDF(32,(%0, %1, 4))
1847 HDF(40,(%%ebx))
1848 HDF(48,(%%ebx, %1))
1849 HDF(56,(%%ebx, %1, 2))
1850 :
1851 : "r" (dst), "r" (stride), "r" (QP)
1852 : "%eax", "%ebx", "%ecx"
1853 );
1854 #else
1855 int y;
1856 for(y=0; y<BLOCK_SIZE; y++)
1857 {
1858 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
1859
1860 if(ABS(middleEnergy) < 8*QP)
1861 {
1862 const int q=(dst[3] - dst[4])/2;
1863 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
1864 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
1865
1866 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1867 d= MAX(d, 0);
1868
1869 d= (5*d + 32) >> 6;
1870 d*= SIGN(-middleEnergy);
1871
1872 if(q>0)
1873 {
1874 d= d<0 ? 0 : d;
1875 d= d>q ? q : d;
1876 }
1877 else
1878 {
1879 d= d>0 ? 0 : d;
1880 d= d<q ? q : d;
1881 }
1882
1883 dst[3]-= d;
1884 dst[4]+= d;
1885 }
1886 dst+= stride;
1887 }
1888 #endif
1889 }
1890
1891 /**
1892 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
1893 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1894 * using the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
1895 */
1896 static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
1897 {
1898
1899 #if 0
1900 asm volatile(
1901 "leal (%0, %1), %%ecx \n\t"
1902 "leal (%%ecx, %1, 4), %%ebx \n\t"
1903 // 0 1 2 3 4 5 6 7 8 9
1904 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1905 "pxor %%mm7, %%mm7 \n\t"
1906 "leal tempBlock, %%eax \n\t"
1907 /*
1908 #define HLP1 "movq (%0), %%mm0 \n\t"\
1909 "movq %%mm0, %%mm1 \n\t"\
1910 "psllq $8, %%mm0 \n\t"\
1911 PAVGB(%%mm1, %%mm0)\
1912 "psrlw $8, %%mm0 \n\t"\
1913 "pxor %%mm1, %%mm1 \n\t"\
1914 "packuswb %%mm1, %%mm0 \n\t"\
1915 "movq %%mm0, %%mm1 \n\t"\
1916 "movq %%mm0, %%mm2 \n\t"\
1917 "psllq $32, %%mm0 \n\t"\
1918 "paddb %%mm0, %%mm1 \n\t"\
1919 "psllq $16, %%mm2 \n\t"\
1920 PAVGB(%%mm2, %%mm0)\
1921 "movq %%mm0, %%mm3 \n\t"\
1922 "pand bm11001100, %%mm0 \n\t"\
1923 "paddusb %%mm0, %%mm3 \n\t"\
1924 "psrlq $8, %%mm3 \n\t"\
1925 PAVGB(%%mm1, %%mm4)\
1926 PAVGB(%%mm3, %%mm2)\
1927 "psrlq $16, %%mm2 \n\t"\
1928 "punpcklbw %%mm2, %%mm2 \n\t"\
1929 "movq %%mm2, (%0) \n\t"\
1930
1931 #define HLP2 "movq (%0), %%mm0 \n\t"\
1932 "movq %%mm0, %%mm1 \n\t"\
1933 "psllq $8, %%mm0 \n\t"\
1934 PAVGB(%%mm1, %%mm0)\
1935 "psrlw $8, %%mm0 \n\t"\
1936 "pxor %%mm1, %%mm1 \n\t"\
1937 "packuswb %%mm1, %%mm0 \n\t"\
1938 "movq %%mm0, %%mm2 \n\t"\
1939 "psllq $32, %%mm0 \n\t"\
1940 "psllq $16, %%mm2 \n\t"\
1941 PAVGB(%%mm2, %%mm0)\
1942 "movq %%mm0, %%mm3 \n\t"\
1943 "pand bm11001100, %%mm0 \n\t"\
1944 "paddusb %%mm0, %%mm3 \n\t"\
1945 "psrlq $8, %%mm3 \n\t"\
1946 PAVGB(%%mm3, %%mm2)\
1947 "psrlq $16, %%mm2 \n\t"\
1948 "punpcklbw %%mm2, %%mm2 \n\t"\
1949 "movq %%mm2, (%0) \n\t"\
1950 */
1951 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
1952 /*
1953 Implemented Exact 7-Tap
1954 9421 A321
1955 36421 64321
1956 334321 =
1957 1234321 =
1958 1234321 =
1959 123433 =
1960 12463 12346
1961 1249 123A
1962
1963 */
1964
1965 #ifdef HAVE_MMX2
1966 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\
1967 "movq %%mm0, %%mm1 \n\t"\
1968 "movq %%mm0, %%mm2 \n\t"\
1969 "movq %%mm0, %%mm3 \n\t"\
1970 "movq %%mm0, %%mm4 \n\t"\
1971 "psllq $8, %%mm1 \n\t"\
1972 "psrlq $8, %%mm2 \n\t"\
1973 "pand bm00000001, %%mm3 \n\t"\
1974 "pand bm10000000, %%mm4 \n\t"\
1975 "por %%mm3, %%mm1 \n\t"\
1976 "por %%mm4, %%mm2 \n\t"\
1977 PAVGB(%%mm2, %%mm1)\
1978 PAVGB(%%mm1, %%mm0)\
1979 \
1980 "pshufw $0xF9, %%mm0, %%mm3 \n\t"\
1981 "pshufw $0x90, %%mm0, %%mm4 \n\t"\
1982 PAVGB(%%mm3, %%mm4)\
1983 PAVGB(%%mm4, %%mm0)\
1984 "movd %%mm0, (%0) \n\t"\
1985 "psrlq $32, %%mm0 \n\t"\
1986 "movd %%mm0, 4(%0) \n\t"
1987 #else
1988 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\
1989 "movq %%mm0, %%mm1 \n\t"\
1990 "movq %%mm0, %%mm2 \n\t"\
1991 "movd -4(%0), %%mm3 \n\t" /*0001000*/\
1992 "movd 8(%0), %%mm4 \n\t" /*0001000*/\
1993 "psllq $8, %%mm1 \n\t"\
1994 "psrlq $8, %%mm2 \n\t"\
1995 "psrlq $24, %%mm3 \n\t"\
1996 "psllq $56, %%mm4 \n\t"\
1997 "por %%mm3, %%mm1 \n\t"\
1998 "por %%mm4, %%mm2 \n\t"\
1999 PAVGB(%%mm2, %%mm1)\
2000 PAVGB(%%mm1, %%mm0)\
2001 \
2002 "movq %%mm0, %%mm3 \n\t"\
2003 "movq %%mm0, %%mm4 \n\t"\
2004 "movq %%mm0, %%mm5 \n\t"\
2005 "psrlq $16, %%mm3 \n\t"\
2006 "psllq $16, %%mm4 \n\t"\
2007 "pand bm11000000, %%mm5 \n\t"\
2008 "por %%mm5, %%mm3 \n\t"\
2009 "movq %%mm0, %%mm5 \n\t"\
2010 "pand bm00000011, %%mm5 \n\t"\
2011 "por %%mm5, %%mm4 \n\t"\
2012 PAVGB(%%mm3, %%mm4)\
2013 PAVGB(%%mm4, %%mm0)\
2014 "movd %%mm0, (%0) \n\t"\
2015 "psrlq $32, %%mm0 \n\t"\
2016 "movd %%mm0, 4(%0) \n\t"
2017 #endif
2018
2019 /* uses the 7-Tap Filter: 1112111 */
2020 #define NEW_HLP(src, dst)\
2021 "movq " #src "(%%eax), %%mm1 \n\t"\
2022 "movq " #src "(%%eax), %%mm2 \n\t"\
2023 "psllq $8, %%mm1 \n\t"\
2024 "psrlq $8, %%mm2 \n\t"\
2025 "movd -4" #dst ", %%mm3 \n\t" /*0001000*/\
2026 "movd 8" #dst ", %%mm4 \n\t" /*0001000*/\
2027 "psrlq $24, %%mm3 \n\t"\
2028 "psllq $56, %%mm4 \n\t"\
2029 "por %%mm3, %%mm1 \n\t"\
2030 "por %%mm4, %%mm2 \n\t"\
2031 "movq %%mm1, %%mm5 \n\t"\
2032 PAVGB(%%mm2, %%mm1)\
2033 "movq " #src "(%%eax), %%mm0 \n\t"\
2034 PAVGB(%%mm1, %%mm0)\
2035 "psllq $8, %%mm5 \n\t"\
2036 "psrlq $8, %%mm2 \n\t"\
2037 "por %%mm3, %%mm5 \n\t"\
2038 "por %%mm4, %%mm2 \n\t"\
2039 "movq %%mm5, %%mm1 \n\t"\
2040 PAVGB(%%mm2, %%mm5)\
2041 "psllq $8, %%mm1 \n\t"\
2042 "psrlq $8, %%mm2 \n\t"\
2043 "por %%mm3, %%mm1 \n\t"\
2044 "por %%mm4, %%mm2 \n\t"\
2045 PAVGB(%%mm2, %%mm1)\
2046 PAVGB(%%mm1, %%mm5)\
2047 PAVGB(%%mm5, %%mm0)\
2048 "movd %%mm0, " #dst " \n\t"\
2049 "psrlq $32, %%mm0 \n\t"\
2050 "movd %%mm0, 4" #dst " \n\t"
2051
2052 /* uses the 9-Tap Filter: 112242211 */
2053 #define NEW_HLP2(i)\
2054 "movq " #i "(%%eax), %%mm0 \n\t" /*0001000*/\
2055 "movq %%mm0, %%mm1 \n\t" /*0001000*/\
2056 "movq %%mm0, %%mm2 \n\t" /*0001000*/\
2057 "movd -4(%0), %%mm3 \n\t" /*0001000*/\
2058 "movd 8(%0), %%mm4 \n\t" /*0001000*/\
2059 "psllq $8, %%mm1 \n\t"\
2060 "psrlq $8, %%mm2 \n\t"\
2061 "psrlq $24, %%mm3 \n\t"\
2062 "psllq $56, %%mm4 \n\t"\
2063 "por %%mm3, %%mm1 \n\t" /*0010000*/\
2064 "por %%mm4, %%mm2 \n\t" /*0000100*/\
2065 "movq %%mm1, %%mm5 \n\t" /*0010000*/\
2066 PAVGB(%%mm2, %%mm1) /*0010100*/\
2067 PAVGB(%%mm1, %%mm0) /*0012100*/\
2068 "psllq $8, %%mm5 \n\t"\
2069 "psrlq $8, %%mm2 \n\t"\
2070 "por %%mm3, %%mm5 \n\t" /*0100000*/\
2071 "por %%mm4, %%mm2 \n\t" /*0000010*/\
2072 "movq %%mm5, %%mm1 \n\t" /*0100000*/\
2073 PAVGB(%%mm2, %%mm5) /*0100010*/\
2074 "psllq $8, %%mm1 \n\t"\
2075 "psrlq $8, %%mm2 \n\t"\
2076 "por %%mm3, %%mm1 \n\t" /*1000000*/\
2077 "por %%mm4, %%mm2 \n\t" /*0000001*/\
2078 "movq %%mm1, %%mm6 \n\t" /*1000000*/\
2079 PAVGB(%%mm2, %%mm1) /*1000001*/\
2080 "psllq $8, %%mm6 \n\t"\
2081 "psrlq $8, %%mm2 \n\t"\
2082 "por %%mm3, %%mm6 \n\t"/*100000000*/\
2083 "por %%mm4, %%mm2 \n\t"/*000000001*/\
2084 PAVGB(%%mm2, %%mm6) /*100000001*/\
2085 PAVGB(%%mm6, %%mm1) /*110000011*/\
2086 PAVGB(%%mm1, %%mm5) /*112000211*/\
2087 PAVGB(%%mm5, %%mm0) /*112242211*/\
2088 "movd %%mm0, (%0) \n\t"\
2089 "psrlq $32, %%mm0 \n\t"\
2090 "movd %%mm0, 4(%0) \n\t"
2091
2092 #define HLP(src, dst) NEW_HLP(src, dst)
2093
2094 HLP(0, (%0))
2095 HLP(8, (%%ecx))
2096 HLP(16, (%%ecx, %1))
2097 HLP(24, (%%ecx, %1, 2))
2098 HLP(32, (%0, %1, 4))
2099 HLP(40, (%%ebx))
2100 HLP(48, (%%ebx, %1))
2101 HLP(56, (%%ebx, %1, 2))
2102
2103 :
2104 : "r" (dst), "r" (stride)
2105 : "%eax", "%ebx", "%ecx"
2106 );
2107
2108 #else
2109 int y;
2110 for(y=0; y<BLOCK_SIZE; y++)
2111 {
2112 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
2113 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
2114
2115 int sums[9];
2116 sums[0] = first + dst[0];
2117 sums[1] = dst[0] + dst[1];
2118 sums[2] = dst[1] + dst[2];
2119 sums[3] = dst[2] + dst[3];
2120 sums[4] = dst[3] + dst[4];
2121 sums[5] = dst[4] + dst[5];
2122 sums[6] = dst[5] + dst[6];
2123 sums[7] = dst[6] + dst[7];
2124 sums[8] = dst[7] + last;
2125
2126 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
2127 dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
2128 dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
2129 dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
2130 dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
2131 dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
2132 dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
2133 dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
2134
2135 dst+= stride;
2136 } 1437 }
2137 #endif 1438 #endif
2138 } 1439 }
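
The sums[] code above is an unrolled form of the 9-Tap (1,1,2,2,4,2,2,1,1)/16 filter named in the comment, sharing the pairwise sums between neighbouring outputs. A direct-form sketch of the same per-row computation (the helper name is made up; it assumes the ABS macro and uint8_t used elsewhere in this file, and like the original it may read row[-1] and row[8]):

static void horizLowPassRowRef(uint8_t *row, int QP)
{
	static const int tap[9]= {1,1,2,2,4,2,2,1,1};
	/* same boundary handling as the C version: only use the pixel outside
	   the block if it is close enough to the edge pixel */
	const int first= ABS(row[-1] - row[0]) < QP ? row[-1] : row[0];
	const int last = ABS(row[8]  - row[7]) < QP ? row[8]  : row[7];
	int tmp[8];
	int x, k;

	for(x=0; x<8; x++)
	{
		int sum= 8;                     /* rounding */
		for(k=-4; k<=4; k++)
		{
			int i= x + k;
			int v= i < 0 ? first : (i > 7 ? last : row[i]);
			sum+= tap[k+4] * v;
		}
		tmp[x]= sum >> 4;
	}
	for(x=0; x<8; x++) row[x]= tmp[x];      /* write back after all taps are read */
}

Expanding the inner loop and folding shared terms into pairwise sums gives exactly the dst[0]..dst[7] expressions used above.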
2139 1440
2140 static inline void dering(uint8_t src[], int stride, int QP) 1441 static inline void dering(uint8_t src[], int stride, int QP)
3531 2832
3532 horizontal_size >>= 1; 2833 horizontal_size >>= 1;
3533 vertical_size >>= 1; 2834 vertical_size >>= 1;
3534 src_stride >>= 1; 2835 src_stride >>= 1;
3535 dst_stride >>= 1; 2836 dst_stride >>= 1;
3536 // mode&= ~(LINEAR_IPOL_DEINT_FILTER | LINEAR_BLEND_DEINT_FILTER |
3537 // MEDIAN_DEINT_FILTER | CUBIC_IPOL_DEINT_FILTER);
3538 2837
3539 if(1) 2838 if(1)
3540 { 2839 {
3541 postProcess(src[1], src_stride, dst[1], dst_stride, 2840 postProcess(src[1], src_stride, dst[1], dst_stride,
3542 horizontal_size, vertical_size, QP_store, QP_stride, 1, &ppMode); 2841 horizontal_size, vertical_size, QP_store, QP_stride, 1, &ppMode);
3636 * Copies a block from src to dst and fixes the blacklevel 2935 * Copies a block from src to dst and fixes the blacklevel
3637 * numLines must be a multiple of 4 2936 * numLines must be a multiple of 4
3638 * levelFix == 0 -> don't touch the brightness & contrast 2937 * levelFix == 0 -> don't touch the brightness & contrast
3639 */ 2938 */
3640 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, 2939 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
3641 int numLines, int levelFix) 2940 int levelFix)
3642 { 2941 {
3643 #ifndef HAVE_MMX 2942 #ifndef HAVE_MMX
3644 int i; 2943 int i;
3645 #endif 2944 #endif
3646 if(levelFix) 2945 if(levelFix)
3693 :"r" (srcStride), 2992 :"r" (srcStride),
3694 "r" (dstStride) 2993 "r" (dstStride)
3695 : "%eax", "%ebx" 2994 : "%eax", "%ebx"
3696 ); 2995 );
3697 #else 2996 #else
3698 for(i=0; i<numLines; i++) 2997 for(i=0; i<8; i++)
3699 memcpy( &(dst[dstStride*i]), 2998 memcpy( &(dst[dstStride*i]),
3700 &(src[srcStride*i]), BLOCK_SIZE); 2999 &(src[srcStride*i]), BLOCK_SIZE);
3701 #endif 3000 #endif
3702 } 3001 }
3703 else 3002 else
3704 { 3003 {
3705 #ifdef HAVE_MMX 3004 #ifdef HAVE_MMX
3706 asm volatile( 3005 asm volatile(
3707 "movl %4, %%eax \n\t"
3708 "movl %%eax, temp0\n\t"
3709 "pushl %0 \n\t" 3006 "pushl %0 \n\t"
3710 "pushl %1 \n\t" 3007 "pushl %1 \n\t"
3711 "leal (%2,%2), %%eax \n\t" 3008 "leal (%2,%2), %%eax \n\t"
3712 "leal (%3,%3), %%ebx \n\t" 3009 "leal (%3,%3), %%ebx \n\t"
3713 "movq packedYOffset, %%mm2 \n\t"
3714 "movq packedYScale, %%mm3 \n\t"
3715 3010
3716 #define SIMPLE_CPY \ 3011 #define SIMPLE_CPY \
3717 "movq (%0), %%mm0 \n\t"\ 3012 "movq (%0), %%mm0 \n\t"\
3718 "movq (%0,%2), %%mm1 \n\t"\ 3013 "movq (%0,%2), %%mm1 \n\t"\
3719 "movq %%mm0, (%1) \n\t"\ 3014 "movq %%mm0, (%1) \n\t"\
3720 "movq %%mm1, (%1, %3) \n\t"\ 3015 "movq %%mm1, (%1, %3) \n\t"\
3721 3016
3722 "1: \n\t"
3723 SIMPLE_CPY 3017 SIMPLE_CPY
3724 "addl %%eax, %0 \n\t" 3018 "addl %%eax, %0 \n\t"
3725 "addl %%ebx, %1 \n\t" 3019 "addl %%ebx, %1 \n\t"
3726 SIMPLE_CPY 3020 SIMPLE_CPY
3727 "addl %%eax, %0 \n\t" 3021 "addl %%eax, %0 \n\t"
3728 "addl %%ebx, %1 \n\t" 3022 "addl %%ebx, %1 \n\t"
3729 "decl temp0 \n\t" 3023 SIMPLE_CPY
3730 "jnz 1b \n\t" 3024 "addl %%eax, %0 \n\t"
3025 "addl %%ebx, %1 \n\t"
3026 SIMPLE_CPY
3731 3027
3732 "popl %1 \n\t" 3028 "popl %1 \n\t"
3733 "popl %0 \n\t" 3029 "popl %0 \n\t"
3734 : : "r" (src), 3030 : : "r" (src),
3735 "r" (dst), 3031 "r" (dst),
3736 "r" (srcStride), 3032 "r" (srcStride),
3737 "r" (dstStride), 3033 "r" (dstStride)
3738 "m" (numLines>>2)
3739 : "%eax", "%ebx" 3034 : "%eax", "%ebx"
3740 ); 3035 );
3741 #else 3036 #else
3742 for(i=0; i<numLines; i++) 3037 for(i=0; i<8; i++)
3743 memcpy( &(dst[dstStride*i]), 3038 memcpy( &(dst[dstStride*i]),
3744 &(src[srcStride*i]), BLOCK_SIZE); 3039 &(src[srcStride*i]), BLOCK_SIZE);
3745 #endif 3040 #endif
3746 } 3041 }
3747 } 3042 }
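
Conceptually, the levelFix path mentioned in the comment above stretches the measured luma range back to full range while copying; a rough sketch of the idea only (black, white and the helper name are illustrative stand-ins for what the MMX code does with packedYOffset/packedYScale):

static void copyBlockLevelFixSketch(uint8_t *dst, int dstStride,
                                    const uint8_t *src, int srcStride,
                                    int black, int white)
{
	int i, j;
	for(i=0; i<8; i++)
		for(j=0; j<BLOCK_SIZE; j++)
		{
			/* map [black..white] to [0..255] and clamp */
			int v= ((src[srcStride*i + j] - black) * 255) / (white - black);
			dst[dstStride*i + j]= v < 0 ? 0 : (v > 255 ? 255 : v);
		}
}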
3772 3067
3773 /* Temporal noise reducing buffers */ 3068 /* Temporal noise reducing buffers */
3774 static uint8_t *tempBlured[3]= {NULL,NULL,NULL}; 3069 static uint8_t *tempBlured[3]= {NULL,NULL,NULL};
3775 static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL}; 3070 static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL};
3776 3071
3072 int copyAhead;
3073
3777 #ifdef PP_FUNNY_STRIDE 3074 #ifdef PP_FUNNY_STRIDE
3778 uint8_t *dstBlockPtrBackup; 3075 uint8_t *dstBlockPtrBackup;
3779 uint8_t *srcBlockPtrBackup; 3076 uint8_t *srcBlockPtrBackup;
3780 #endif 3077 #endif
3781 3078
3790 #ifdef HAVE_MMX 3087 #ifdef HAVE_MMX
3791 maxTmpNoise[0]= ppMode->maxTmpNoise[0]; 3088 maxTmpNoise[0]= ppMode->maxTmpNoise[0];
3792 maxTmpNoise[1]= ppMode->maxTmpNoise[1]; 3089 maxTmpNoise[1]= ppMode->maxTmpNoise[1];
3793 maxTmpNoise[2]= ppMode->maxTmpNoise[2]; 3090 maxTmpNoise[2]= ppMode->maxTmpNoise[2];
3794 #endif 3091 #endif
3092
3093 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
3094 else if(mode & LINEAR_BLEND_DEINT_FILTER) copyAhead=14;
3095 else if( (mode & V_DEBLOCK)
3096 || (mode & LINEAR_IPOL_DEINT_FILTER)
3097 || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13;
3098 else if(mode & V_X1_FILTER) copyAhead=11;
3099 else if(mode & V_RK1_FILTER) copyAhead=10;
3100 else if(mode & DERING) copyAhead=9;
3101 else copyAhead=8;
3102
3103 copyAhead-= 8;
3795 3104
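
This per-mode copyAhead value is the core of the change: instead of always precopying 8 lines at a fixed offset of 8, only as many lines as the active filters actually need below the current block row are kept ahead in dst. After the subtraction it becomes the row offset at which the next 8 source lines are copied, as in the blockCopy call further down:

	blockCopy(dstBlock + dstStride*copyAhead, dstStride,
		srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);

For example, with only horizontal filters enabled copyAhead ends up as 0 and no extra lines are duplicated, while the cubic deinterlacer still gets 8 lines copied 8 rows ahead.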
3796 if(tempDst==NULL) 3105 if(tempDst==NULL)
3797 { 3106 {
3798 tempDst= (uint8_t*)memalign(8, 1024*24); 3107 tempDst= (uint8_t*)memalign(8, 1024*24);
3799 tempSrc= (uint8_t*)memalign(8, 1024*24); 3108 tempSrc= (uint8_t*)memalign(8, 1024*24);
3896 for(x=0; x<width; x+=BLOCK_SIZE) 3205 for(x=0; x<width; x+=BLOCK_SIZE)
3897 { 3206 {
3898 3207
3899 #ifdef HAVE_MMX2 3208 #ifdef HAVE_MMX2
3900 /* 3209 /*
3901 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3902 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3903 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3904 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3905 */
3906 /*
3907 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); 3210 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3908 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); 3211 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3909 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); 3212 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3910 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); 3213 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3911 */ 3214 */
3912 3215
3913 asm( 3216 asm(
3914 "movl %4, %%eax \n\t" 3217 "movl %4, %%eax \n\t"
3915 "shrl $2, %%eax \n\t" 3218 "shrl $2, %%eax \n\t"
3916 "andl $6, %%eax \n\t" 3219 "andl $6, %%eax \n\t"
3917 "addl $8, %%eax \n\t" 3220 "addl %5, %%eax \n\t"
3918 "movl %%eax, %%ebx \n\t" 3221 "movl %%eax, %%ebx \n\t"
3919 "imul %1, %%eax \n\t" 3222 "imul %1, %%eax \n\t"
3920 "imul %3, %%ebx \n\t" 3223 "imul %3, %%ebx \n\t"
3921 "prefetchnta 32(%%eax, %0) \n\t" 3224 "prefetchnta 32(%%eax, %0) \n\t"
3922 "prefetcht0 32(%%ebx, %2) \n\t" 3225 "prefetcht0 32(%%ebx, %2) \n\t"
3923 "addl %1, %%eax \n\t" 3226 "addl %1, %%eax \n\t"
3924 "addl %3, %%ebx \n\t" 3227 "addl %3, %%ebx \n\t"
3925 "prefetchnta 32(%%eax, %0) \n\t" 3228 "prefetchnta 32(%%eax, %0) \n\t"
3926 "prefetcht0 32(%%ebx, %2) \n\t" 3229 "prefetcht0 32(%%ebx, %2) \n\t"
3927 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), 3230 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3928 "m" (x) 3231 "m" (x), "m" (copyAhead)
3929 : "%eax", "%ebx" 3232 : "%eax", "%ebx"
3930 ); 3233 );
3931 3234
3932 #elif defined(HAVE_3DNOW) 3235 #elif defined(HAVE_3DNOW)
3933 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ... 3236 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ...
3936 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); 3239 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3937 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); 3240 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3938 */ 3241 */
3939 #endif 3242 #endif
3940 3243
3941 blockCopy(dstBlock + dstStride*8, dstStride, 3244 blockCopy(dstBlock + dstStride*copyAhead, dstStride,
3942 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX); 3245 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);
3943 3246
3944 if(mode & LINEAR_IPOL_DEINT_FILTER) 3247 if(mode & LINEAR_IPOL_DEINT_FILTER)
3945 deInterlaceInterpolateLinear(dstBlock, dstStride); 3248 deInterlaceInterpolateLinear(dstBlock, dstStride);
3946 else if(mode & LINEAR_BLEND_DEINT_FILTER) 3249 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3947 deInterlaceBlendLinear(dstBlock, dstStride); 3250 deInterlaceBlendLinear(dstBlock, dstStride);
3953 deInterlaceBlendCubic(dstBlock, dstStride); 3256 deInterlaceBlendCubic(dstBlock, dstStride);
3954 */ 3257 */
3955 dstBlock+=8; 3258 dstBlock+=8;
3956 srcBlock+=8; 3259 srcBlock+=8;
3957 } 3260 }
3958 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, 8*dstStride ); 3261 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, copyAhead*dstStride );
3959 } 3262 }
3960 3263
3961 for(y=0; y<height; y+=BLOCK_SIZE) 3264 for(y=0; y<height; y+=BLOCK_SIZE)
3962 { 3265 {
3963 //1% speedup if these are here instead of the inner loop 3266 //1% speedup if these are here instead of the inner loop
3974 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards 3277 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3975 if not, then use a temporary buffer */ 3278 if not, then use a temporary buffer */
3976 if(y+15 >= height) 3279 if(y+15 >= height)
3977 { 3280 {
3978 int i; 3281 int i;
3979 /* copy from line 8 to 15 of src, these will be copied with 3282 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3980 blockCopy to dst later */ 3283 blockCopy to dst later */
3981 memcpy(tempSrc + srcStride*8, srcBlock + srcStride*8, 3284 memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3982 srcStride*MAX(height-y-8, 0) ); 3285 srcStride*MAX(height-y-copyAhead, 0) );
3983 3286
3984 /* duplicate last line of src to fill the void up to line 15 */ 3287 /* duplicate last line of src to fill the void up to line (copyAhead+7) */
3985 for(i=MAX(height-y, 8); i<=15; i++) 3288 for(i=MAX(height-y, 8); i<copyAhead+8; i++)
3986 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride); 3289 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride);
3987 3290
3988 /* copy up to 9 lines of dst (line -1 to 7)*/ 3291 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3989 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, 9) ); 3292 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) );
3990 3293
3991 /* duplicate last line of dst to fill the void upto line 8 */ 3294 /* duplicate last line of dst to fill the void upto line (copyAhead) */
3992 for(i=height-y+1; i<=8; i++) 3295 for(i=height-y+1; i<=copyAhead; i++)
3993 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride); 3296 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride);
3994 3297
3995 dstBlock= tempDst + dstStride; 3298 dstBlock= tempDst + dstStride;
3996 srcBlock= tempSrc; 3299 srcBlock= tempSrc;
3997 } 3300 }
4039 T0= rdtsc(); 3342 T0= rdtsc();
4040 #endif 3343 #endif
4041 3344
4042 #ifdef HAVE_MMX2 3345 #ifdef HAVE_MMX2
4043 /* 3346 /*
4044 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
4045 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
4046 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
4047 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
4048 */
4049 /*
4050 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); 3347 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
4051 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); 3348 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
4052 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); 3349 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
4053 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); 3350 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
4054 */ 3351 */
4055 3352
4056 asm( 3353 asm(
4057 "movl %4, %%eax \n\t" 3354 "movl %4, %%eax \n\t"
4058 "shrl $2, %%eax \n\t" 3355 "shrl $2, %%eax \n\t"
4059 "andl $6, %%eax \n\t" 3356 "andl $6, %%eax \n\t"
4060 "addl $8, %%eax \n\t" 3357 "addl %5, %%eax \n\t"
4061 "movl %%eax, %%ebx \n\t" 3358 "movl %%eax, %%ebx \n\t"
4062 "imul %1, %%eax \n\t" 3359 "imul %1, %%eax \n\t"
4063 "imul %3, %%ebx \n\t" 3360 "imul %3, %%ebx \n\t"
4064 "prefetchnta 32(%%eax, %0) \n\t" 3361 "prefetchnta 32(%%eax, %0) \n\t"
4065 "prefetcht0 32(%%ebx, %2) \n\t" 3362 "prefetcht0 32(%%ebx, %2) \n\t"
4066 "addl %1, %%eax \n\t" 3363 "addl %1, %%eax \n\t"
4067 "addl %3, %%ebx \n\t" 3364 "addl %3, %%ebx \n\t"
4068 "prefetchnta 32(%%eax, %0) \n\t" 3365 "prefetchnta 32(%%eax, %0) \n\t"
4069 "prefetcht0 32(%%ebx, %2) \n\t" 3366 "prefetcht0 32(%%ebx, %2) \n\t"
4070 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), 3367 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
4071 "m" (x) 3368 "m" (x), "m" (copyAhead)
4072 : "%eax", "%ebx" 3369 : "%eax", "%ebx"
4073 ); 3370 );
4074 3371
4075 #elif defined(HAVE_3DNOW) 3372 #elif defined(HAVE_3DNOW)
4076 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ... 3373 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ...
4098 dstBlock= tempDstBlock; 3395 dstBlock= tempDstBlock;
4099 srcBlock= tempSrcBlock; 3396 srcBlock= tempSrcBlock;
4100 } 3397 }
4101 #endif 3398 #endif
4102 3399
4103 blockCopy(dstBlock + dstStride*8, dstStride, 3400 blockCopy(dstBlock + dstStride*copyAhead, dstStride,
4104 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX); 3401 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);
4105 3402
4106 if(mode & LINEAR_IPOL_DEINT_FILTER) 3403 if(mode & LINEAR_IPOL_DEINT_FILTER)
4107 deInterlaceInterpolateLinear(dstBlock, dstStride); 3404 deInterlaceInterpolateLinear(dstBlock, dstStride);
4108 else if(mode & LINEAR_BLEND_DEINT_FILTER) 3405 else if(mode & LINEAR_BLEND_DEINT_FILTER)
4109 deInterlaceBlendLinear(dstBlock, dstStride); 3406 deInterlaceBlendLinear(dstBlock, dstStride);
4158 vertRK1Filter(tempBlock1, 16, QP); 3455 vertRK1Filter(tempBlock1, 16, QP);
4159 else if(mode & H_X1_FILTER) 3456 else if(mode & H_X1_FILTER)
4160 vertX1Filter(tempBlock1, 16, QP); 3457 vertX1Filter(tempBlock1, 16, QP);
4161 else if(mode & H_DEBLOCK) 3458 else if(mode & H_DEBLOCK)
4162 { 3459 {
4163 if( isVertDC(tempBlock1, 16)) 3460 if( isVertDC(tempBlock1, 16) )
4164 { 3461 {
4165 if(isVertMinMaxOk(tempBlock1, 16, QP)) 3462 if(isVertMinMaxOk(tempBlock1, 16, QP))
4166 doVertLowPass(tempBlock1, 16, QP); 3463 doVertLowPass(tempBlock1, 16, QP);
4167 } 3464 }
4168 else 3465 else
4250 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) ); 3547 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
4251 } 3548 }
4252 /* 3549 /*
4253 for(x=0; x<width; x+=32) 3550 for(x=0; x<width; x+=32)
4254 { 3551 {
4255 int i; 3552 volatile int i;
4256 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] 3553 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
4257 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] 3554 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
4258 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride] 3555 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
4259 + dstBlock[x +13*dstStride] + dstBlock[x +14*dstStride] 3556 // + dstBlock[x +13*dstStride]
4260 + dstBlock[x +15*dstStride]; 3557 // + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
4261 } 3558 }*/
4262 */ } 3559 }
4263 #ifdef HAVE_3DNOW 3560 #ifdef HAVE_3DNOW
4264 asm volatile("femms"); 3561 asm volatile("femms");
4265 #elif defined (HAVE_MMX) 3562 #elif defined (HAVE_MMX)
4266 asm volatile("emms"); 3563 asm volatile("emms");
4267 #endif 3564 #endif