comparison postproc/postprocess.c @ 2473:94a0265c408c

dering in mmx2
author michael
date Thu, 25 Oct 2001 21:55:11 +0000
parents 60f16575bece
children 3369845d92f4
comparison
equal deleted inserted replaced
2472:487f5bbb38ae 2473:94a0265c408c
24 doVertDefFilter Ec Ec Ec 24 doVertDefFilter Ec Ec Ec
25 isHorizDC Ec Ec 25 isHorizDC Ec Ec
26 isHorizMinMaxOk a E 26 isHorizMinMaxOk a E
27 doHorizLowPass E e e 27 doHorizLowPass E e e
28 doHorizDefFilter Ec Ec Ec 28 doHorizDefFilter Ec Ec Ec
29 deRing 29 deRing e
30 Vertical RKAlgo1 E a a 30 Vertical RKAlgo1 E a a
31 Horizontal RKAlgo1 a a 31 Horizontal RKAlgo1 a a
32 Vertical X1 a E E 32 Vertical X1 a E E
33 Horizontal X1 a E E 33 Horizontal X1 a E E
34 LinIpolDeinterlace e E E* 34 LinIpolDeinterlace e E E*
63 noise reduction filters 63 noise reduction filters
64 border remover 64 border remover
65 ... 65 ...
66 66
67 Notes: 67 Notes:
68 fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
69 */ 68 */
70 69
71 //Changelog: use the CVS log 70 //Changelog: use the CVS log
72 71
73 #include <inttypes.h> 72 #include <inttypes.h>
114 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL; 113 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL;
115 static uint64_t b00= 0x0000000000000000LL; 114 static uint64_t b00= 0x0000000000000000LL;
116 static uint64_t b01= 0x0101010101010101LL; 115 static uint64_t b01= 0x0101010101010101LL;
117 static uint64_t b02= 0x0202020202020202LL; 116 static uint64_t b02= 0x0202020202020202LL;
118 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL; 117 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL;
118 static uint64_t b04= 0x0404040404040404LL;
119 static uint64_t b08= 0x0808080808080808LL;
119 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL; 120 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL;
120 static uint64_t b20= 0x2020202020202020LL; 121 static uint64_t b20= 0x2020202020202020LL;
121 static uint64_t b80= 0x8080808080808080LL; 122 static uint64_t b80= 0x8080808080808080LL;
122 static uint64_t b7E= 0x7E7E7E7E7E7E7E7ELL; 123 static uint64_t b7E= 0x7E7E7E7E7E7E7E7ELL;
123 static uint64_t b7C= 0x7C7C7C7C7C7C7C7CLL; 124 static uint64_t b7C= 0x7C7C7C7C7C7C7C7CLL;
127 static uint64_t temp2=0; 128 static uint64_t temp2=0;
128 static uint64_t temp3=0; 129 static uint64_t temp3=0;
129 static uint64_t temp4=0; 130 static uint64_t temp4=0;
130 static uint64_t temp5=0; 131 static uint64_t temp5=0;
131 static uint64_t pQPb=0; 132 static uint64_t pQPb=0;
133 static uint64_t pQPb2=0;
132 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code 134 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
133 135
134 int hFlatnessThreshold= 56 - 16; 136 int hFlatnessThreshold= 56 - 16;
135 int vFlatnessThreshold= 56 - 16; 137 int vFlatnessThreshold= 56 - 16;
136 138
1804 #endif 1806 #endif
1805 } 1807 }
1806 1808
1807 static inline void dering(uint8_t src[], int stride, int QP) 1809 static inline void dering(uint8_t src[], int stride, int QP)
1808 { 1810 {
1809 //FIXME 1811 #ifdef HAVE_MMX2
1810
1811 #ifdef HAVE_MMX2X
1812 asm volatile( 1812 asm volatile(
1813 "movq pQPb, %%mm0 \n\t"
1814 "paddusb %%mm0, %%mm0 \n\t"
1815 "movq %%mm0, pQPb2 \n\t"
1816
1813 "leal (%0, %1), %%eax \n\t" 1817 "leal (%0, %1), %%eax \n\t"
1814 "leal (%%eax, %1, 4), %%ebx \n\t" 1818 "leal (%%eax, %1, 4), %%ebx \n\t"
1815 // 0 1 2 3 4 5 6 7 8 9 1819 // 0 1 2 3 4 5 6 7 8 9
1816 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 1820 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1817 1821
1818 "pcmpeq %%mm6, %%mm6 \n\t" 1822 "pcmpeqb %%mm6, %%mm6 \n\t"
1819 "pxor %%mm7, %%mm7 \n\t" 1823 "pxor %%mm7, %%mm7 \n\t"
1820 1824
1821 #define FIND_MIN_MAX(addr)\ 1825 #define FIND_MIN_MAX(addr)\
1822 "movq (" #addr "), %%mm0, \n\t"\ 1826 "movq " #addr ", %%mm0 \n\t"\
1823 "pminub %%mm0, %%mm6 \n\t"\ 1827 "pminub %%mm0, %%mm6 \n\t"\
1824 "pmaxub %%mm0, %%mm7 \n\t" 1828 "pmaxub %%mm0, %%mm7 \n\t"
1825 1829
1826 FIND_MIN_MAX(%0) 1830 FIND_MIN_MAX((%%eax))
1827 FIND_MIN_MAX(%%eax) 1831 FIND_MIN_MAX((%%eax, %1))
1828 FIND_MIN_MAX(%%eax, %1) 1832 FIND_MIN_MAX((%%eax, %1, 2))
1829 FIND_MIN_MAX(%%eax, %1, 2) 1833 FIND_MIN_MAX((%0, %1, 4))
1830 FIND_MIN_MAX(%0, %1, 4) 1834 FIND_MIN_MAX((%%ebx))
1831 FIND_MIN_MAX(%%ebx) 1835 FIND_MIN_MAX((%%ebx, %1))
1832 FIND_MIN_MAX(%%ebx, %1) 1836 FIND_MIN_MAX((%%ebx, %1, 2))
1833 FIND_MIN_MAX(%%ebx, %1, 2) 1837 FIND_MIN_MAX((%0, %1, 8))
1834 FIND_MIN_MAX(%0, %1, 8)
1835 FIND_MIN_MAX(%%ebx, %1, 2)
1836 1838
1837 "movq %%mm6, %%mm4 \n\t" 1839 "movq %%mm6, %%mm4 \n\t"
1838 "psrlq $8, %%mm6 \n\t" 1840 "psrlq $8, %%mm6 \n\t"
1839 "pminub %%mm4, %%mm6 \n\t" // min of pixels 1841 "pminub %%mm4, %%mm6 \n\t" // min of pixels
1840 #ifdef HAVE_MMX2 1842 #ifdef HAVE_MMX2
1864 "pmaxub %%mm4, %%mm7 \n\t" 1866 "pmaxub %%mm4, %%mm7 \n\t"
1865 "movq %%mm7, %%mm4 \n\t" 1867 "movq %%mm7, %%mm4 \n\t"
1866 "psrlq $32, %%mm7 \n\t" 1868 "psrlq $32, %%mm7 \n\t"
1867 #endif 1869 #endif
1868 "pmaxub %%mm4, %%mm7 \n\t" 1870 "pmaxub %%mm4, %%mm7 \n\t"
1869 PAVGB(%%mm6, %%mm7) // (max + min)/2 1871 PAVGB(%%mm6, %%mm7) // a=(max + min)/2
1870 "punpcklbw %%mm7, %%mm7 \n\t" 1872 "punpcklbw %%mm7, %%mm7 \n\t"
1871 "punpcklbw %%mm7, %%mm7 \n\t" 1873 "punpcklbw %%mm7, %%mm7 \n\t"
1872 "punpcklbw %%mm7, %%mm7 \n\t" 1874 "punpcklbw %%mm7, %%mm7 \n\t"
1873 1875 "movq %%mm7, temp0 \n\t"
1874 "movq (%0), %%mm0 \n\t" 1876
1875 "movq %%mm0, %%mm1 \n\t" 1877 "movq (%0), %%mm0 \n\t" // L10
1876 1878 "movq %%mm0, %%mm1 \n\t" // L10
1877 1879 "movq %%mm0, %%mm2 \n\t" // L10
1880 "psllq $8, %%mm1 \n\t"
1881 "psrlq $8, %%mm2 \n\t"
1882 "movd -4(%0), %%mm3 \n\t"
1883 "movd 8(%0), %%mm4 \n\t"
1884 "psrlq $24, %%mm3 \n\t"
1885 "psllq $56, %%mm4 \n\t"
1886 "por %%mm3, %%mm1 \n\t" // L00
1887 "por %%mm4, %%mm2 \n\t" // L20
1888 "movq %%mm1, %%mm3 \n\t" // L00
1889 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
1890 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
1891 "psubusb %%mm7, %%mm0 \n\t"
1892 "psubusb %%mm7, %%mm2 \n\t"
1893 "psubusb %%mm7, %%mm3 \n\t"
1894 "pcmpeqb b00, %%mm0 \n\t" // L10 > a ? 0 : -1
1895 "pcmpeqb b00, %%mm2 \n\t" // L20 > a ? 0 : -1
1896 "pcmpeqb b00, %%mm3 \n\t" // L00 > a ? 0 : -1
1897 "paddb %%mm2, %%mm0 \n\t"
1898 "paddb %%mm3, %%mm0 \n\t"
1899
1900 "movq (%%eax), %%mm2 \n\t" // L11
1901 "movq %%mm2, %%mm3 \n\t" // L11
1902 "movq %%mm2, %%mm4 \n\t" // L11
1903 "psllq $8, %%mm3 \n\t"
1904 "psrlq $8, %%mm4 \n\t"
1905 "movd -4(%%eax), %%mm5 \n\t"
1906 "movd 8(%%eax), %%mm6 \n\t"
1907 "psrlq $24, %%mm5 \n\t"
1908 "psllq $56, %%mm6 \n\t"
1909 "por %%mm5, %%mm3 \n\t" // L01
1910 "por %%mm6, %%mm4 \n\t" // L21
1911 "movq %%mm3, %%mm5 \n\t" // L01
1912 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
1913 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
1914 "psubusb %%mm7, %%mm2 \n\t"
1915 "psubusb %%mm7, %%mm4 \n\t"
1916 "psubusb %%mm7, %%mm5 \n\t"
1917 "pcmpeqb b00, %%mm2 \n\t" // L11 > a ? 0 : -1
1918 "pcmpeqb b00, %%mm4 \n\t" // L21 > a ? 0 : -1
1919 "pcmpeqb b00, %%mm5 \n\t" // L01 > a ? 0 : -1
1920 "paddb %%mm4, %%mm2 \n\t"
1921 "paddb %%mm5, %%mm2 \n\t"
1922 // 0, 2, 3, 1
1923 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1924 "movq " #src ", " #sx " \n\t" /* src[0] */\
1925 "movq " #sx ", " #lx " \n\t" /* src[0] */\
1926 "movq " #sx ", " #t0 " \n\t" /* src[0] */\
1927 "psllq $8, " #lx " \n\t"\
1928 "psrlq $8, " #t0 " \n\t"\
1929 "movd -4" #src ", " #t1 " \n\t"\
1930 "psrlq $24, " #t1 " \n\t"\
1931 "por " #t1 ", " #lx " \n\t" /* src[-1] */\
1932 "movd 8" #src ", " #t1 " \n\t"\
1933 "psllq $56, " #t1 " \n\t"\
1934 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
1935 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
1936 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
1937 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
1938 "psubusb temp0, " #t1 " \n\t"\
1939 "psubusb temp0, " #t0 " \n\t"\
1940 "psubusb temp0, " #sx " \n\t"\
1941 "pcmpeqb b00, " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
1942 "pcmpeqb b00, " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
1943 "pcmpeqb b00, " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
1944 "paddb " #t1 ", " #t0 " \n\t"\
1945 "paddb " #t0 ", " #sx " \n\t"\
1946 \
1947 PAVGB(lx, pplx) \
1948 PAVGB(plx, pplx) /* filtered */\
1949 "movq " #dst ", " #t0 " \n\t" /* dst */\
1950 "movq " #pplx ", " #t1 " \n\t"\
1951 "psubusb " #t0 ", " #pplx " \n\t"\
1952 "psubusb " #t1 ", " #t0 " \n\t"\
1953 "por " #t0 ", " #pplx " \n\t" /* |filtered - dst| */\
1954 "psubusb pQPb2, " #pplx " \n\t"\
1955 "pcmpeqb b00, " #pplx " \n\t"\
1956 "paddb " #sx ", " #ppsx " \n\t"\
1957 "paddb " #psx ", " #ppsx " \n\t"\
1958 "#paddb b02, " #ppsx " \n\t"\
1959 "pand b08, " #ppsx " \n\t"\
1960 "pcmpeqb b00, " #ppsx " \n\t"\
1961 "pand " #pplx ", " #ppsx " \n\t"\
1962 "pand " #ppsx ", " #t1 " \n\t"\
1963 "pandn " #dst ", " #ppsx " \n\t"\
1964 "por " #t1 ", " #ppsx " \n\t"\
1965 "movq " #ppsx ", " #dst " \n\t"
1966 /*
1967 0000000
1968 1111111
1969
1970 1111110
1971 1111101
1972 1111100
1973 1111011
1974 1111010
1975 1111001
1976
1977 1111000
1978 1110111
1979
1980 */
1981 //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
1982 DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1983 DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1984 DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1985 DERING_CORE((%0, %1, 4),(%%ebx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1986 DERING_CORE((%%ebx),(%%ebx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1987 DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1988 DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1989 DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1878 1990
1879 1991
1880 : : "r" (src), "r" (stride), "r" (QP) 1992 : : "r" (src), "r" (stride), "r" (QP)
1881 : "%eax", "%ebx" 1993 : "%eax", "%ebx"
1882 ); 1994 );
2872 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4); 2984 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
2873 int QPFrac= QPDelta; 2985 int QPFrac= QPDelta;
2874 uint8_t *tempBlock1= tempBlocks; 2986 uint8_t *tempBlock1= tempBlocks;
2875 uint8_t *tempBlock2= tempBlocks + 8; 2987 uint8_t *tempBlock2= tempBlocks + 8;
2876 #endif 2988 #endif
2877 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards, if not 2989 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
2878 then use a temporary buffer */ 2990 if not then use a temporary buffer */
2879 if(y+15 >= height) 2991 if(y+15 >= height)
2880 { 2992 {
2881 /* copy from line 5 to 12 of src, these will e copied with 2993 /* copy from line 5 to 12 of src, these will be copied with
2882 blockcopy to dst later */ 2994 blockcopy to dst later */
2883 memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5, 2995 memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5,
2884 srcStride*MAX(height-y-5, 0) ); 2996 srcStride*MAX(height-y-5, 0) );
2885 2997
2886 /* duplicate last line to fill the void up to line 12 */ 2998 /* duplicate last line to fill the void up to line 12 */
2891 memcpy(tempSrc + srcStride*i, 3003 memcpy(tempSrc + srcStride*i,
2892 src + srcStride*(height-1), srcStride); 3004 src + srcStride*(height-1), srcStride);
2893 } 3005 }
2894 3006
2895 3007
2896 /* copy up to 5 lines of dst */ 3008 /* copy up to 6 lines of dst */
2897 memcpy(tempDst, dstBlock, dstStride*MIN(height-y, 5) ); 3009 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, 6) );
2898 dstBlock= tempDst; 3010 dstBlock= tempDst + dstStride;
2899 srcBlock= tempSrc; 3011 srcBlock= tempSrc;
2900 } 3012 }
2901 3013
2902 // From this point on it is guaranteed that we can read and write 16 lines downward 3014 // From this point on it is guaranteed that we can read and write 16 lines downward
2903 // finish 1 block before the next otherwise we might have a problem 3015 // finish 1 block before the next otherwise we might have a problem
3044 T1= rdtsc(); 3156 T1= rdtsc();
3045 vertTime+= T1-T0; 3157 vertTime+= T1-T0;
3046 T0=T1; 3158 T0=T1;
3047 #endif 3159 #endif
3048 } 3160 }
3161
3049 #ifdef HAVE_MMX 3162 #ifdef HAVE_MMX
3050 transpose1(tempBlock1, tempBlock2, dstBlock, dstStride); 3163 transpose1(tempBlock1, tempBlock2, dstBlock, dstStride);
3051 #endif 3164 #endif
3052 /* check if we have a previous block to deblock it with dstBlock */ 3165 /* check if we have a previous block to deblock it with dstBlock */
3053 if(x - 8 >= 0) 3166 if(x - 8 >= 0)
3090 #ifdef MORE_TIMING 3203 #ifdef MORE_TIMING
3091 T1= rdtsc(); 3204 T1= rdtsc();
3092 horizTime+= T1-T0; 3205 horizTime+= T1-T0;
3093 T0=T1; 3206 T0=T1;
3094 #endif 3207 #endif
3095 dering(dstBlock - 9 - stride, stride, QP); 3208 if(mode & DERING)
3209 {
3210 //FIXME filter first line
3211 if(y>0) dering(dstBlock - stride - 8, stride, QP);
3212 }
3096 } 3213 }
3097 else if(y!=0) 3214 else if(mode & DERING)
3098 dering(dstBlock - stride*9 + width-9, stride, QP); 3215 {
3099 //FIXME dering filter will not be applied to last block (bottom right) 3216 //FIXME y+15 is required because of the tempBuffer thing -> bottom right block isn't filtered
3217 if(y > 8 && y+15 < height) dering(dstBlock - stride*9 + width - 8, stride, QP);
3218 }
3219
3100 3220
3101 #ifdef PP_FUNNY_STRIDE 3221 #ifdef PP_FUNNY_STRIDE
3102 /* did we use a tmp-block buffer */ 3222 /* did we use a tmp-block buffer */
3103 if(x+7 >= width) 3223 if(x+7 >= width)
3104 { 3224 {
3125 3245
3126 /* did we use a tmp buffer */ 3246 /* did we use a tmp buffer */
3127 if(y+15 >= height) 3247 if(y+15 >= height)
3128 { 3248 {
3129 uint8_t *dstBlock= &(dst[y*dstStride]); 3249 uint8_t *dstBlock= &(dst[y*dstStride]);
3130 memcpy(dstBlock, tempDst, dstStride*(height-y) ); 3250 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
3131 } 3251 }
3132 } 3252 }
3133 #ifdef HAVE_3DNOW 3253 #ifdef HAVE_3DNOW
3134 asm volatile("femms"); 3254 asm volatile("femms");
3135 #elif defined (HAVE_MMX) 3255 #elif defined (HAVE_MMX)